diff --git a/docs/developers/internal.md b/docs/developers/internal.md index 091abf9d..7b68b413 100644 --- a/docs/developers/internal.md +++ b/docs/developers/internal.md @@ -33,3 +33,28 @@ Unit tests were written for pytest, which can be installed with pip and run from ```bash pytest ``` + + +## Documentation + +These pages are generated with mkdocs. + +To set things up, perform the following steps (substitute name of venv if needed). + +``` +python3 -m venv venvhpo +source venvhpo/bin/activate +pip install --upgrade pip +pip install mkdocs +pip install mkdocs-material +pip install mkdocs-material[imaging] +pip install pillow cairosvg +pip install mkdocs-material-extensions +pip install mkdocstrings[python] +``` + +To start a local server, enter: +``` +mkdocs serve +``` + \ No newline at end of file diff --git a/docs/user-guide/discombobulator.md b/docs/user-guide/discombobulator.md new file mode 100644 index 00000000..fdae6756 --- /dev/null +++ b/docs/user-guide/discombobulator.md @@ -0,0 +1,42 @@ +# Discombobulator + + +Sometimes tables contain many different related items. For instance, the following box shows the contents of one cell in a table from [PMID:30057029](https://pubmed.ncbi.nlm.nih.gov/30057029/){:target="_blank"} in a column entitled "dysmorphology". + + +!!! dysmorphology + + frontal bossing, curled hair, highly arched and sparse eyebrows, long eyelashes, downslanting palpebral fissures, depressed nasal ridge, tented upper lip + +While we could create annotations by hand and create one column for each entry in this cell (and all of the other entries in the column), doing so can be error-prone and time-consuming. Therefore, pyphetools has a (currently experimental) feature called "discombobulation", which takes all of the entries from each such cell in a column and creates corresponding columns and rows for the standard Excel template file. To do this, we create the following python code. 
+ + ```python +from pyphetools.creation import Discombobulator, HpoParser +import pandas as pd +parser = HpoParser() +hp_cr = parser.get_hpo_concept_recognizer() +disco = Discombobulator(hpo_cr=hp_cr) +``` + +This creates a Discombobulator object that can be used for all of the relevant columns of the original supplemental file. Assuming for this example that the original file is called "temp.xlsx" and the column of interest is called "face", we would use the following python code. + + +```python +df = pd.read_excel("temp.xlsx") +df_face = disco.decode(df=df, column="face", assumeExcluded=True) +df_face.head(3) +``` + +The assumeExcluded argument determines whether we record a feature as absent if it is not mentioned in a certain cell but is mentioned in another cell in the same column. This assumption seems justified for dysmorphology features if the authors state a full examination was conducted. + + +For now, this function operates one column at a time. We can save the results in an Excel file and manually add them to the template file. + +```python +df_face.to_excel("temp_face.xlsx") +``` + +This functionality is currently in an experimental stage and we are exploring ways to make its use easier. We do not recommend using the Discombobulator unless you are very comfortable with Python and Excel. + +There is no need to keep the temporary Excel files or python code after creating the main template file. \ No newline at end of file diff --git a/docs/user-guide/python_notebook.md b/docs/user-guide/python_notebook.md index ca41d61b..401d1609 100644 --- a/docs/user-guide/python_notebook.md +++ b/docs/user-guide/python_notebook.md @@ -8,7 +8,7 @@ The following sections explain how to use Python code to create phenopackets fro We first import the TemplateImporter to import the data and create phenopackets, and several classes to visualize the data. 
```python -from pyphetools.creation import TemplateImporter +from pyphetools.creation import TemplateImporter, Moi from pyphetools.visualization import IndividualTable, QcVisualizer from IPython.display import display, HTML import pyphetools @@ -78,11 +78,11 @@ the mode of inheritance (MOI) and then indicate the MOI. If multiple distinct di Check results of variant encoding. ```python pmid = "PMID:36333996" -timporter.create_hpoa_from_phenopackets(pmid=pmid, moi="Autosomal recessive") +df = timporter.create_hpoa_from_phenopackets(pmid=pmid, mode_of_inheritance=Moi.AR) ``` or ```python pmid = "PMID:36333996" -timporter.create_hpoa_from_phenopackets(pmid=pmid, moi="Autosomal recessive", target="OMIM:620427") +df = timporter.create_hpoa_from_phenopackets(pmid=pmid, mode_of_inheritance=Moi.AD, target="OMIM:620427") ``` diff --git a/mkdocs.yml b/mkdocs.yml index 26e4e81b..1dac490e 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -37,6 +37,7 @@ nav: - 'user-guide/python_notebook.md' - 'user-guide/tips_for_curation.md' - 'user-guide/variant_notation.md' + - 'user-guide/discombobulator.md' - Coding tabular data with Python scripts: - Overview: 'tabular/overview.md' - Jupyter notebooks: 'tabular/jupyter.md' diff --git a/notebooks/Example_1_TRPM3_PMID_31278393.ipynb b/notebooks/Example_1_TRPM3_PMID_31278393.ipynb index 3d323094..0b52b712 100644 --- a/notebooks/Example_1_TRPM3_PMID_31278393.ipynb +++ b/notebooks/Example_1_TRPM3_PMID_31278393.ipynb @@ -19,7 +19,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "pyphetools version 0.8.2\n" + "pyphetools version 0.9.77\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/robin/GIT/pyphetools/ppt_venv/lib/python3.9/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. 
See: https://github.com/urllib3/urllib3/issues/3020\n", + " warnings.warn(\n" ] } ], @@ -27,12 +35,13 @@ "import phenopackets as php\n", "from google.protobuf.json_format import MessageToDict, MessageToJson\n", "from google.protobuf.json_format import Parse, ParseDict\n", + "from IPython.display import display, HTML\n", "import pandas as pd\n", "pd.set_option('display.max_colwidth', None) # show entire column contents, important!\n", "from collections import defaultdict\n", "import pyphetools\n", "from pyphetools.creation import *\n", - "from pyphetools.validation import ContentValidator\n", + "from pyphetools.validation import CohortValidator\n", "from pyphetools.visualization import *\n", "\n", "print(f\"pyphetools version {pyphetools.__version__}\")" @@ -48,18 +57,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "HPO version 2023-10-09\n" + "HPO version 2024-04-26\n" ] } ], "source": [ "# Import HPO data\n", "parser = HpoParser()\n", + "hpo_ontology = parser.get_ontology()\n", "hpo_cr = parser.get_hpo_concept_recognizer()\n", "hpo_version = parser.get_version()\n", "PMID = \"PMID:31278393\"\n", "title = \"De novo substitutions of TRPM3 cause intellectual disability and epilepsy\"\n", - "metadata = MetaData(created_by=\"ORCID:0000-0002-0736-9199\", pmid=PMID, pubmed_title=title)\n", + "cite = Citation(pmid=PMID, title=title)\n", + "metadata = MetaData(created_by=\"ORCID:0000-0002-0736-9199\", citation=cite)\n", "metadata.default_versions_with_hpo(version=hpo_version)\n", "print(f\"HPO version {hpo_version}\")" ] @@ -69,16 +80,6 @@ "execution_count": 3, "id": "af4bbb16", "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_excel('data/PMID_31278393.xlsx')" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "9bb665c1", - "metadata": {}, "outputs": [ { "data": { @@ -137,722 +138,282 @@ " p.(Val837Met)\n", " p.(Pro937Gln)\n", " \n", - " \n", - " 2\n", - " Genomic DNA (NC_000009.11)\n", - " g.73213379C>T\n", - " g.73213379C>T\n", 
- " g.73213379C>T\n", - " g.73213379C>T\n", - " g.73213379C>T\n", - " g.73213379C>T\n", - " g.73213379C>T\n", - " g.73168145G>T\n", + " \n", + "\n", + "" + ], + "text/plain": [ + " Individual 1 2 3 \\\n", + "0 cDNA (NM_020952.4) c.2509G>A c.2509G>A c.2509G>A \n", + "1 Polypeptide (NP_066003.3) p.(Val837Met) p.(Val837Met) p.(Val837Met) \n", + "\n", + " 4 5 6 7 \\\n", + "0 c.2509G>A c.2509G>A c.2509G>A c.2509G>A \n", + "1 p.(Val837Met) p.(Val837Met) p.(Val837Met) p.(Val837Met) \n", + "\n", + " 8 \n", + "0 c.2810C>A \n", + "1 p.(Pro937Gln) " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_excel('data/PMID_31278393.xlsx')\n", + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f3068215", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", + " \n", " \n", " \n", "
IndividualcDNA (NM_020952.4)Polypeptide (NP_066003.3)Genomic DNA (NC_000009.11)ZygositySegregationClinical featuresGestation (weeks)Perinatal historyBirth weight (kg)Sex...Craniofacial gestaltMorphological featuresOther clinical featuresBrain MRIApparent heat or pain insensitivityGenetic investigationsaCGHFragile XOther (nondiagnostic) genetic investigationsindividual_id
3ZygosityHeterozygousHeterozygousHeterozygousHeterozygousHeterozygousHeterozygousHeterozygous1c.2509G>Ap.(Val837Met)g.73213379C>THeterozygous
4SegregationDe novoDe novoDe novoDe novoDe novoDe novoDe novoDe novo
5Clinical featuresNaNNaNNaNNaNNaNNaNNaN38C/SNRM...NondysmorphicBroad forehead, deeply set eyes, ptosis, bulbous nasal tip, micrognathia, prominent lobule of ear, tapering fingersC1 spinal stenosis; Chiari I malformation; scoliosis; torticollis; plagiocephaly; thickened filum terminale; bilateral talipes equinovarus; strabismus (exotropia OU)Possible mild cerebral volume loss+ (Heat)NaNNormalNormalID panel (170 genes), PHF61
6Gestation (weeks)3840423938 + 32c.2509G>Ap.(Val837Met)g.73213379C>THeterozygousDe novoNaN4039Term
7Perinatal historyC/SNNNNNC/SC/S (repeat)
8Birth weight (kg)NR3.63.23.483.3783.893.12.9
9SexMMFMMMMF
10Age (years)164.7565.96.2528388.1
11Height (cm)164.5 (Z = −1.0)105.1 (Z = −0.7)110 (Z = −1.0)117 (Z = + 0.3)116 (Z = −0.3)...NondysmorphicShort philtrum, long nose, turricephalyEMG/NCS normalNormalNR169.5 (Z = −1.3)115.7 (Z = −2.0)
12Weight (kg)73.3 (Z = + 1.0)17.6 (Z = −0.1)17.8 (Z = −0.9)24.5 (Z = + 1.4)22 (Z = + 0.3)NaNNormalNormalNR63.2 (Z = −1.0)27.8 (Z = + 0.6)2
13BMI (kg/m2)27.1 (Z = + 1.8)15.9 (Z = + 0.5)14.7 (Z = −0.4)17.9 (Z = + 1.7)16.3 (Z = + 0.7)3c.2509G>Ap.(Val837Met)g.73213379C>THeterozygousDe novoNaN42N3.2F...NondysmorphicNR22.1 (Z = −0.0)22.3 (Z = + 2.0)
14OFC (cm)55.8 (15 years, 8 months) (Z = + 0.2)49.5 (Z = −0.8)51 (Z = + 0.2)55 (Z = + 2.1)53.2 (Z = + 0.7)56 (Z = 0)57 (Z = + 0.5)52 (Z = + 0.2)
15Developmental delay/intellectual disability+ (Severe)+ (Moderate)+ (Moderate-to-severe)+ (Severe)+ (Severe)+ (Severe)+ (Moderate)+ (Moderate-to-severe)
16Ambulate independently (age achieved)+ (5 years)+ (With walker) (3 years)+ (With walker)+ (4.5 years)+ (4 years)+ (3.5 years)NormalNRNaNNormalNormalMECP2, SMA3
17Any speech (age attained)+ (5 years)+ (5 years)+ (2.5 years)4c.2509G>Ap.(Val837Met)g.73213379C>THeterozygousDe novoNaN39N3.48M...NRBroad forehead, deeply set eyes, flat midface, short philtrum, micrognathia, broad halluces, fifth-finger clinodactyly, pectus excavatumStrabismusNormalNRNaNNormalNormalNR4
18Combine words/signs++ (Signs)+ (Sentences)
19Toilet independently (age attained)+ (9 years)NR(4 years)
20Autism-like features+NR+++NR
21Electrographically confirmed seizures++++++Unconfirmed+
22Seizure typesAbsenceInfantile spasmsGTCSubclinical, including ESESNRAbsence and GTCAbsenceAbsence
23Age of first clinical seizureAbsence-like episodes in infancy; first documented EEG abnormalities at 7 yearsNRNREEG abnormalities at 3 years11 months9 months<1 year2 years
24Current anticonvulsant therapyLevetiracetam (initial); none (current)NRNRDiazepam qHS (with improvement in ESES)LevetiracetamNoneLamotrigine
25Age of last clinical seizureNR (untreated follow-up EEG at age 15 was normal)NRNRNR5 years, 9 months26 years (EEG remains pathological with diffuse high-amplitude activity)NR6 years
26Hypotonia++++++ (mixed tone abnormality)+
27Craniofacial gestaltNondysmorphicNondysmorphicNondysmorphicNRNRNRDistinctiveNondysmorphic
28Morphological featuresBroad forehead, deeply set eyes, ptosis, bulbous nasal tip, micrognathia, prominent lobule of ear, tapering fingersShort philtrum, long nose, turricephaly5c.2509G>Ap.(Val837Met)g.73213379C>THeterozygousDe novoNaN38 + 3N3.378M...NRBroad forehead, deeply set eyes, flat midface, short philtrum, micrognathia, broad halluces, fifth-finger clinodactyly, pectus excavatumBroad forehead, low nasal bridge, unilateral preauricular pit, short broad thumbsMicrognathia, high palateMild facial asymmetry, ptosis, telecanthus, bulbous nasal tip, micrognathia, short neck, low hairlineBroad forehead, deeply set eyes, upslanting palpebral fissures, bulbous/upturned nasal tip, short philtrum, large oral aperture, facial capillary hemangioma
29Other clinical featuresC1 spinal stenosis; Chiari I malformation; scoliosis; torticollis; plagiocephaly; thickened filum terminale; bilateral talipes equinovarus; strabismus (exotropia OU)EMG/NCS normalStrabismusCryptorchidism, micropenis, bilateral talipes equinovarusNeonatal hypoglycemia; unilateral hip dysplasia; scoliosisAtlanto-occipital fusion, strabismus (exotropia), hands held ‘fisted’ until 9 months, athetoid movements in infancy, pes planusChoreoathetoid movements in infancy (age 5 months), strabismus, scoliosis
30Brain MRIPossible mild cerebral volume lossNormalNormalNormalVentriculomegaly, nonspecific periventricular white matter hyperintensitiesNormalNormalNormal
31Apparent heat or pain insensitivity+ (Heat)NRNRNR+ (Pain)NRNR
32Genetic investigationsNaNNaNNaNNaNNaNNaNNaN
33aCGHNormalNormalNormalNormalNormalNormalNormalNormal
34Fragile XNormalNormalNormalNormalN/ANormalNormalN/A
35Other (nondiagnostic) genetic investigationsID panel (170 genes), PHF6NRMECP2, SMANRNRNR9 gene XLID panel, MCT8mtDNA, POLG, DGUOK, TK2, SUCLA2, ID panel (196 genes)5
\n", + "

5 rows × 37 columns

\n", "
" ], "text/plain": [ - " Individual \\\n", - "0 cDNA (NM_020952.4) \n", - "1 Polypeptide (NP_066003.3) \n", - "2 Genomic DNA (NC_000009.11) \n", - "3 Zygosity \n", - "4 Segregation \n", - "5 Clinical features \n", - "6 Gestation (weeks) \n", - "7 Perinatal history \n", - "8 Birth weight (kg) \n", - "9 Sex \n", - "10 Age (years) \n", - "11 Height (cm) \n", - "12 Weight (kg) \n", - "13 BMI (kg/m2) \n", - "14 OFC (cm) \n", - "15 Developmental delay/intellectual disability \n", - "16 Ambulate independently (age achieved) \n", - "17 Any speech (age attained) \n", - "18 Combine words/signs \n", - "19 Toilet independently (age attained) \n", - "20 Autism-like features \n", - "21 Electrographically confirmed seizures \n", - "22 Seizure types \n", - "23 Age of first clinical seizure \n", - "24 Current anticonvulsant therapy \n", - "25 Age of last clinical seizure \n", - "26 Hypotonia \n", - "27 Craniofacial gestalt \n", - "28 Morphological features \n", - "29 Other clinical features \n", - "30 Brain MRI \n", - "31 Apparent heat or pain insensitivity \n", - "32 Genetic investigations \n", - "33 aCGH \n", - "34 Fragile X \n", - "35 Other (nondiagnostic) genetic investigations \n", + "Individual cDNA (NM_020952.4) Polypeptide (NP_066003.3) \\\n", + "1 c.2509G>A p.(Val837Met) \n", + "2 c.2509G>A p.(Val837Met) \n", + "3 c.2509G>A p.(Val837Met) \n", + "4 c.2509G>A p.(Val837Met) \n", + "5 c.2509G>A p.(Val837Met) \n", + "\n", + "Individual Genomic DNA (NC_000009.11) Zygosity Segregation \\\n", + "1 g.73213379C>T Heterozygous De novo \n", + "2 g.73213379C>T Heterozygous De novo \n", + "3 g.73213379C>T Heterozygous De novo \n", + "4 g.73213379C>T Heterozygous De novo \n", + "5 g.73213379C>T Heterozygous De novo \n", + "\n", + "Individual Clinical features Gestation (weeks) Perinatal history \\\n", + "1 NaN 38 C/S \n", + "2 NaN 40 N \n", + "3 NaN 42 N \n", + "4 NaN 39 N \n", + "5 NaN 38 + 3 N \n", + "\n", + "Individual Birth weight (kg) Sex ... 
Craniofacial gestalt \\\n", + "1 NR M ... Nondysmorphic \n", + "2 3.6 M ... Nondysmorphic \n", + "3 3.2 F ... Nondysmorphic \n", + "4 3.48 M ... NR \n", + "5 3.378 M ... NR \n", "\n", - " 1 \\\n", - "0 c.2509G>A \n", - "1 p.(Val837Met) \n", - "2 g.73213379C>T \n", - "3 Heterozygous \n", - "4 De novo \n", - "5 NaN \n", - "6 38 \n", - "7 C/S \n", - "8 NR \n", - "9 M \n", - "10 16 \n", - "11 164.5 (Z = −1.0) \n", - "12 73.3 (Z = + 1.0) \n", - "13 27.1 (Z = + 1.8) \n", - "14 55.8 (15 years, 8 months) (Z = + 0.2) \n", - "15 + (Severe) \n", - "16 + (5 years) \n", - "17 + (5 years) \n", - "18 + \n", - "19 + (9 years) \n", - "20 + \n", - "21 + \n", - "22 Absence \n", - "23 Absence-like episodes in infancy; first documented EEG abnormalities at 7 years \n", - "24 Levetiracetam (initial); none (current) \n", - "25 NR (untreated follow-up EEG at age 15 was normal) \n", - "26 + \n", - "27 Nondysmorphic \n", - "28 Broad forehead, deeply set eyes, ptosis, bulbous nasal tip, micrognathia, prominent lobule of ear, tapering fingers \n", - "29 C1 spinal stenosis; Chiari I malformation; scoliosis; torticollis; plagiocephaly; thickened filum terminale; bilateral talipes equinovarus; strabismus (exotropia OU) \n", - "30 Possible mild cerebral volume loss \n", - "31 + (Heat) \n", - "32 NaN \n", - "33 Normal \n", - "34 Normal \n", - "35 ID panel (170 genes), PHF6 \n", + "Individual Morphological features \\\n", + "1 Broad forehead, deeply set eyes, ptosis, bulbous nasal tip, micrognathia, prominent lobule of ear, tapering fingers \n", + "2 Short philtrum, long nose, turricephaly \n", + "3 NR \n", + "4 Broad forehead, deeply set eyes, flat midface, short philtrum, micrognathia, broad halluces, fifth-finger clinodactyly, pectus excavatum \n", + "5 Broad forehead, low nasal bridge, unilateral preauricular pit, short broad thumbs \n", "\n", - " 2 3 \\\n", - "0 c.2509G>A c.2509G>A \n", - "1 p.(Val837Met) p.(Val837Met) \n", - "2 g.73213379C>T g.73213379C>T \n", - "3 Heterozygous Heterozygous 
\n", - "4 De novo De novo \n", - "5 NaN NaN \n", - "6 40 42 \n", - "7 N N \n", - "8 3.6 3.2 \n", - "9 M F \n", - "10 4.75 6 \n", - "11 105.1 (Z = −0.7) 110 (Z = −1.0) \n", - "12 17.6 (Z = −0.1) 17.8 (Z = −0.9) \n", - "13 15.9 (Z = + 0.5) 14.7 (Z = −0.4) \n", - "14 49.5 (Z = −0.8) 51 (Z = + 0.2) \n", - "15 + (Moderate) + (Moderate-to-severe) \n", - "16 + (With walker) (3 years) − \n", - "17 − − \n", - "18 − − \n", - "19 − − \n", - "20 NR + \n", - "21 + + \n", - "22 Infantile spasms GTC \n", - "23 NR NR \n", - "24 NR NR \n", - "25 NR NR \n", - "26 + + \n", - "27 Nondysmorphic Nondysmorphic \n", - "28 Short philtrum, long nose, turricephaly NR \n", - "29 EMG/NCS normal − \n", - "30 Normal Normal \n", - "31 NR NR \n", - "32 NaN NaN \n", - "33 Normal Normal \n", - "34 Normal Normal \n", - "35 NR MECP2, SMA \n", + "Individual Other clinical features \\\n", + "1 C1 spinal stenosis; Chiari I malformation; scoliosis; torticollis; plagiocephaly; thickened filum terminale; bilateral talipes equinovarus; strabismus (exotropia OU) \n", + "2 EMG/NCS normal \n", + "3 − \n", + "4 Strabismus \n", + "5 Cryptorchidism, micropenis, bilateral talipes equinovarus \n", "\n", - " 4 \\\n", - "0 c.2509G>A \n", - "1 p.(Val837Met) \n", - "2 g.73213379C>T \n", - "3 Heterozygous \n", - "4 De novo \n", - "5 NaN \n", - "6 39 \n", - "7 N \n", - "8 3.48 \n", - "9 M \n", - "10 5.9 \n", - "11 117 (Z = + 0.3) \n", - "12 24.5 (Z = + 1.4) \n", - "13 17.9 (Z = + 1.7) \n", - "14 55 (Z = + 2.1) \n", - "15 + (Severe) \n", - "16 + (With walker) \n", - "17 − \n", - "18 − \n", - "19 − \n", - "20 + \n", - "21 + \n", - "22 Subclinical, including ESES \n", - "23 EEG abnormalities at 3 years \n", - "24 Diazepam qHS (with improvement in ESES) \n", - "25 NR \n", - "26 + \n", - "27 NR \n", - "28 Broad forehead, deeply set eyes, flat midface, short philtrum, micrognathia, broad halluces, fifth-finger clinodactyly, pectus excavatum \n", - "29 Strabismus \n", - "30 Normal \n", - "31 NR \n", - "32 NaN \n", - "33 Normal 
\n", - "34 Normal \n", - "35 NR \n", + "Individual Brain MRI \\\n", + "1 Possible mild cerebral volume loss \n", + "2 Normal \n", + "3 Normal \n", + "4 Normal \n", + "5 Ventriculomegaly, nonspecific periventricular white matter hyperintensities \n", "\n", - " 5 \\\n", - "0 c.2509G>A \n", - "1 p.(Val837Met) \n", - "2 g.73213379C>T \n", - "3 Heterozygous \n", - "4 De novo \n", - "5 NaN \n", - "6 38 + 3 \n", - "7 N \n", - "8 3.378 \n", - "9 M \n", - "10 6.25 \n", - "11 116 (Z = −0.3) \n", - "12 22 (Z = + 0.3) \n", - "13 16.3 (Z = + 0.7) \n", - "14 53.2 (Z = + 0.7) \n", - "15 + (Severe) \n", - "16 + (4.5 years) \n", - "17 − \n", - "18 − \n", - "19 − \n", - "20 + \n", - "21 + \n", - "22 NR \n", - "23 11 months \n", - "24 Levetiracetam \n", - "25 5 years, 9 months \n", - "26 + \n", - "27 NR \n", - "28 Broad forehead, low nasal bridge, unilateral preauricular pit, short broad thumbs \n", - "29 Cryptorchidism, micropenis, bilateral talipes equinovarus \n", - "30 Ventriculomegaly, nonspecific periventricular white matter hyperintensities \n", - "31 + (Pain) \n", - "32 NaN \n", - "33 Normal \n", - "34 N/A \n", - "35 NR \n", + "Individual Apparent heat or pain insensitivity Genetic investigations \\\n", + "1 + (Heat) NaN \n", + "2 NR NaN \n", + "3 NR NaN \n", + "4 NR NaN \n", + "5 + (Pain) NaN \n", "\n", - " 6 \\\n", - "0 c.2509G>A \n", - "1 p.(Val837Met) \n", - "2 g.73213379C>T \n", - "3 Heterozygous \n", - "4 De novo \n", - "5 NaN \n", - "6 40 \n", - "7 N \n", - "8 3.89 \n", - "9 M \n", - "10 28 \n", - "11 NR \n", - "12 NR \n", - "13 NR \n", - "14 56 (Z = 0) \n", - "15 + (Severe) \n", - "16 − \n", - "17 − \n", - "18 − \n", - "19 − \n", - "20 − \n", - "21 + \n", - "22 Absence and GTC \n", - "23 9 months \n", - "24   \n", - "25 26 years (EEG remains pathological with diffuse high-amplitude activity) \n", - "26 − \n", - "27 NR \n", - "28 Micrognathia, high palate \n", - "29 Neonatal hypoglycemia; unilateral hip dysplasia; scoliosis \n", - "30 Normal \n", - "31 − \n", - "32 NaN 
\n", - "33 Normal \n", - "34 Normal \n", - "35 NR \n", + "Individual aCGH Fragile X Other (nondiagnostic) genetic investigations \\\n", + "1 Normal Normal ID panel (170 genes), PHF6 \n", + "2 Normal Normal NR \n", + "3 Normal Normal MECP2, SMA \n", + "4 Normal Normal NR \n", + "5 Normal N/A NR \n", "\n", - " 7 \\\n", - "0 c.2509G>A \n", - "1 p.(Val837Met) \n", - "2 g.73213379C>T \n", - "3 Heterozygous \n", - "4 De novo \n", - "5 NaN \n", - "6 39 \n", - "7 C/S \n", - "8 3.1 \n", - "9 M \n", - "10 38 \n", - "11 169.5 (Z = −1.3) \n", - "12 63.2 (Z = −1.0) \n", - "13 22.1 (Z = −0.0) \n", - "14 57 (Z = + 0.5) \n", - "15 + (Moderate) \n", - "16 + (4 years) \n", - "17 + (5 years) \n", - "18 + (Signs) \n", - "19 NR \n", - "20 NR \n", - "21 Unconfirmed \n", - "22 Absence \n", - "23 <1 year \n", - "24 None \n", - "25 NR \n", - "26 + (mixed tone abnormality) \n", - "27 Distinctive \n", - "28 Mild facial asymmetry, ptosis, telecanthus, bulbous nasal tip, micrognathia, short neck, low hairline \n", - "29 Atlanto-occipital fusion, strabismus (exotropia), hands held ‘fisted’ until 9 months, athetoid movements in infancy, pes planus \n", - "30 Normal \n", - "31 NR \n", - "32 NaN \n", - "33 Normal \n", - "34 Normal \n", - "35 9 gene XLID panel, MCT8 \n", + "Individual individual_id \n", + "1 1 \n", + "2 2 \n", + "3 3 \n", + "4 4 \n", + "5 5 \n", "\n", - " 8 \n", - "0 c.2810C>A \n", - "1 p.(Pro937Gln) \n", - "2 g.73168145G>T \n", - "3 Heterozygous \n", - "4 De novo \n", - "5 NaN \n", - "6 Term \n", - "7 C/S (repeat) \n", - "8 2.9 \n", - "9 F \n", - "10 8.1 \n", - "11 115.7 (Z = −2.0) \n", - "12 27.8 (Z = + 0.6) \n", - "13 22.3 (Z = + 2.0) \n", - "14 52 (Z = + 0.2) \n", - "15 + (Moderate-to-severe) \n", - "16 + (3.5 years) \n", - "17 + (2.5 years) \n", - "18 + (Sentences) \n", - "19 (4 years) \n", - "20 − \n", - "21 + \n", - "22 Absence \n", - "23 2 years \n", - "24 Lamotrigine \n", - "25 6 years \n", - "26 + \n", - "27 Nondysmorphic \n", - "28 Broad forehead, deeply set eyes, 
upslanting palpebral fissures, bulbous/upturned nasal tip, short philtrum, large oral aperture, facial capillary hemangioma \n", - "29 Choreoathetoid movements in infancy (age 5 months), strabismus, scoliosis \n", - "30 Normal \n", - "31 NR \n", - "32   \n", - "33 Normal \n", - "34 N/A \n", - "35 mtDNA, POLG, DGUOK, TK2, SUCLA2, ID panel (196 genes) " + "[5 rows x 37 columns]" ] }, "execution_count": 4, @@ -861,13 +422,39 @@ } ], "source": [ - "df" + "# Convert to row based format\n", + "dft = df.transpose()\n", + "dft.columns = dft.iloc[0]\n", + "dft.drop(dft.index[0], inplace=True)\n", + "dft.head()\n", + "# Note that the Individual is now the row index but we need it to be available as a column\n", + "# Therefore, add it as an explicit, new column\n", + "dft['individual_id'] = dft.index\n", + "dft.head()" + ] + }, + { + "cell_type": "markdown", + "id": "ccff0a96", + "metadata": {}, + "source": [ + "

Column mappers

" ] }, { "cell_type": "code", "execution_count": 5, - "id": "f3068215", + "id": "780039a6", + "metadata": {}, + "outputs": [], + "source": [ + "column_mapper_list = list()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "fbceaf13", "metadata": {}, "outputs": [ { @@ -890,444 +477,821 @@ "\n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IndividualcDNA (NM_020952.4)Polypeptide (NP_066003.3)Genomic DNA (NC_000009.11)ZygositySegregationClinical featuresGestation (weeks)Perinatal historyBirth weight (kg)Sex...Craniofacial gestaltMorphological featuresOther clinical featuresBrain MRIApparent heat or pain insensitivityGenetic investigationsaCGHFragile XOther (nondiagnostic) genetic investigationspatient_idmappingcount
1c.2509G>Ap.(Val837Met)g.73213379C>THeterozygousDe novoNaN38C/SNRM...NondysmorphicBroad forehead, deeply set eyes, ptosis, bulbous nasal tip, micrognathia, prominent lobule of ear, tapering fingersC1 spinal stenosis; Chiari I malformation; scoliosis; torticollis; plagiocephaly; thickened filum terminale; bilateral talipes equinovarus; strabismus (exotropia OU)Possible mild cerebral volume loss+ (Heat)NaNNormalNormalID panel (170 genes), PHF610Intellectual disability, severe (HP:0010864) (observed)4
2c.2509G>Ap.(Val837Met)g.73213379C>THeterozygousDe novoNaN40N3.6M...NondysmorphicShort philtrum, long nose, turricephalyEMG/NCS normalNormalNRNaNNormalNormalNR1Intellectual disability, moderate (HP:0002342) (observed)4
\n", + "" + ], + "text/plain": [ + " mapping count\n", + "0 Intellectual disability, severe (HP:0010864) (observed) 4\n", + "1 Intellectual disability, moderate (HP:0002342) (observed) 4" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Developmental delay/intellectual disability -- use code to intellectual disability \n", + "severity_id = {'+ (Severe)': 'Intellectual disability, severe',\n", + " '+ (Moderate)': 'Intellectual disability, moderate',\n", + " '+ (Moderate-to-severe)':'Intellectual disability, moderate'}\n", + "idMapper = OptionColumnMapper(column_name='Developmental delay/intellectual disability',\n", + " concept_recognizer=hpo_cr, option_d=severity_id)\n", + "column_mapper_list.append(idMapper)\n", + "idMapper.preview_column(dft)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "bafd7359", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
mappingcount
0+ (5 years) -> observed1
1+ (With walker) (3 years) -> observed1
2− -> observed2
3c.2509G>Ap.(Val837Met)g.73213379C>THeterozygousDe novoNaN42N3.2F...NondysmorphicNRNormalNRNaNNormalNormalMECP2, SMA3+ (With walker) -> observed1
4c.2509G>Ap.(Val837Met)g.73213379C>THeterozygousDe novoNaN39N3.48M...NRBroad forehead, deeply set eyes, flat midface, short philtrum, micrognathia, broad halluces, fifth-finger clinodactyly, pectus excavatumStrabismusNormalNRNaNNormalNormalNR4+ (4.5 years) -> observed1
5c.2509G>Ap.(Val837Met)g.73213379C>THeterozygousDe novoNaN38 + 3N3.378M...NRBroad forehead, low nasal bridge, unilateral preauricular pit, short broad thumbsCryptorchidism, micropenis, bilateral talipes equinovarusVentriculomegaly, nonspecific periventricular white matter hyperintensities+ (Pain)NaNNormalN/ANR5+ (4 years) -> observed1
6+ (3.5 years) -> observed1
\n", - "

5 rows × 37 columns

\n", "
" ], "text/plain": [ - "Individual cDNA (NM_020952.4) Polypeptide (NP_066003.3) \\\n", - "1 c.2509G>A p.(Val837Met) \n", - "2 c.2509G>A p.(Val837Met) \n", - "3 c.2509G>A p.(Val837Met) \n", - "4 c.2509G>A p.(Val837Met) \n", - "5 c.2509G>A p.(Val837Met) \n", - "\n", - "Individual Genomic DNA (NC_000009.11) Zygosity Segregation \\\n", - "1 g.73213379C>T Heterozygous De novo \n", - "2 g.73213379C>T Heterozygous De novo \n", - "3 g.73213379C>T Heterozygous De novo \n", - "4 g.73213379C>T Heterozygous De novo \n", - "5 g.73213379C>T Heterozygous De novo \n", + " mapping count\n", + "0 + (5 years) -> observed 1\n", + "1 + (With walker) (3 years) -> observed 1\n", + "2 − -> observed 2\n", + "3 + (With walker) -> observed 1\n", + "4 + (4.5 years) -> observed 1\n", + "5 + (4 years) -> observed 1\n", + "6 + (3.5 years) -> observed 1" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# By inspection, all entries of this column indicate delayed ability to walk. Therefore, use ConstantColumnMapper\n", + "# the alternative would be to code each of the varied entries\n", + "delayedWalkColumn = ConstantColumnMapper(column_name='Ambulate independently (age achieved)',\n", + " hpo_id='HP:0031936', hpo_label='Delayed ability to walk')\n", + "column_mapper_list.append(delayedWalkColumn)\n", + "delayedWalkColumn.preview_column(dft)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "9a64f2ea", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mappingcount
0+ (5 years) -> observed2
1− -> observed5
2+ (2.5 years) -> observed1
\n", + "
" + ], + "text/plain": [ + " mapping count\n", + "0 + (5 years) -> observed 2\n", + "1 − -> observed 5\n", + "2 + (2.5 years) -> observed 1" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## Same comments for speech\n", + "delayedSpeechColumn = ConstantColumnMapper(column_name='Any speech (age attained)',\n", + " hpo_id='HP:0000750', hpo_label='Delayed speech and language development')\n", + "column_mapper_list.append(delayedSpeechColumn)\n", + "delayedSpeechColumn.preview_column(dft)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "de8ccd1d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mappingcount
0original value: \"+ \" -> HP: Autistic behavior (HP:0000729) (observed)4
1original value: \"NR \" -> HP: Autistic behavior (HP:0000729) (not measured)2
2original value: \"− \" -> HP: Autistic behavior (HP:0000729) (excluded)2
\n", + "
" + ], + "text/plain": [ + " mapping \\\n", + "0 original value: \"+ \" -> HP: Autistic behavior (HP:0000729) (observed) \n", + "1 original value: \"NR \" -> HP: Autistic behavior (HP:0000729) (not measured) \n", + "2 original value: \"− \" -> HP: Autistic behavior (HP:0000729) (excluded) \n", "\n", - "Individual Brain MRI \\\n", - "1 Possible mild cerebral volume loss \n", - "2 Normal \n", - "3 Normal \n", - "4 Normal \n", - "5 Ventriculomegaly, nonspecific periventricular white matter hyperintensities \n", + " count \n", + "0 4 \n", + "1 2 \n", + "2 2 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## 'Autism-like features' # Autistic behavior HP:\n", + "autisticFeaturesMapper = SimpleColumnMapper(column_name='Autism-like features',\n", + " hpo_id='HP:0000729', hpo_label='Autistic behavior', observed=\"+\", excluded=\"−\")\n", + "column_mapper_list.append(autisticFeaturesMapper)\n", + "autisticFeaturesMapper.preview_column(dft)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f6aa015f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mappingcount
0Typical absence seizure (HP:0011147) (observed)4
1Infantile spasms (HP:0012469) (observed)1
2Bilateral tonic-clonic seizure (HP:0002069) (observed)2
3Status epilepticus (HP:0002133) (observed)1
\n", + "
" + ], + "text/plain": [ + " mapping count\n", + "0 Typical absence seizure (HP:0011147) (observed) 4\n", + "1 Infantile spasms (HP:0012469) (observed) 1\n", + "2 Bilateral tonic-clonic seizure (HP:0002069) (observed) 2\n", + "3 Status epilepticus (HP:0002133) (observed) 1" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seizure_d = {'Absence': 'Typical absence seizure',\n", + " 'Infantile spasms': 'Infantile spasms',\n", + " 'GTC':'Bilateral tonic-clonic seizure',\n", + " 'ESES': 'Status epilepticus'}\n", + "seizureMapper = OptionColumnMapper(column_name='Seizure types',\n", + " concept_recognizer=hpo_cr, option_d=seizure_d)\n", + "column_mapper_list.append(seizureMapper)\n", + "seizureMapper.preview_column(dft)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "41b668db", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mappingcount
0original value: \"+ \" -> HP: Hypotonia (HP:0001252) (observed)6
1original value: \"− \" -> HP: Hypotonia (HP:0001252) (excluded)1
2original value: \"+ (mixed tone abnormality) \" -> HP: Hypotonia (HP:0001252) (observed)1
\n", + "
" + ], + "text/plain": [ + " mapping \\\n", + "0 original value: \"+ \" -> HP: Hypotonia (HP:0001252) (observed) \n", + "1 original value: \"− \" -> HP: Hypotonia (HP:0001252) (excluded) \n", + "2 original value: \"+ (mixed tone abnormality) \" -> HP: Hypotonia (HP:0001252) (observed) \n", + "\n", + " count \n", + "0 6 \n", + "1 1 \n", + "2 1 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Hypotonia HP:0001252 -- note that we include + (mixed tone abnormality) as Hypotonia\n", + "hypotoniaMapper = SimpleColumnMapper(column_name='Hypotonia',\n", + " hpo_id='HP:0001252', hpo_label='Hypotonia', \n", + " observed=['+', '+ (mixed tone abnormality)'], excluded='−')\n", + "column_mapper_list.append(hypotoniaMapper)\n", + "hypotoniaMapper.preview_column(dft)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "0d1e3050", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mappingcount
0Broad forehead (HP:0000337) (observed)4
1Ptosis (HP:0000508) (observed)2
2Bulbous nose (HP:0000414) (observed)2
3Micrognathia (HP:0000347) (observed)4
4Tapered finger (HP:0001182) (observed)1
5Short philtrum (HP:0000322) (observed)3
6Long nose (HP:0003189) (observed)1
7Turricephaly (HP:0000262) (observed)1
8Midface retrusion (HP:0011800) (observed)1
9Broad hallux (HP:0010055) (observed)1
10Finger clinodactyly (HP:0040019) (observed)1
11Pectus excavatum (HP:0000767) (observed)1
12Depressed nasal bridge (HP:0005280) (observed)1
13Preauricular pit (HP:0004467) (observed)1
14Broad thumb (HP:0011304) (observed)1
15High palate (HP:0000218) (observed)1
16Facial asymmetry (HP:0000324) (observed)1
17Telecanthus (HP:0000506) (observed)1
18Short neck (HP:0000470) (observed)1
19Upslanted palpebral fissure (HP:0000582) (observed)1
20Anteverted nares (HP:0000463) (observed)1
21Wide mouth (HP:0000154) (observed)1
22Facial capillary hemangioma (HP:0000996) (observed)1
\n", + "
" + ], + "text/plain": [ + " mapping count\n", + "0 Broad forehead (HP:0000337) (observed) 4\n", + "1 Ptosis (HP:0000508) (observed) 2\n", + "2 Bulbous nose (HP:0000414) (observed) 2\n", + "3 Micrognathia (HP:0000347) (observed) 4\n", + "4 Tapered finger (HP:0001182) (observed) 1\n", + "5 Short philtrum (HP:0000322) (observed) 3\n", + "6 Long nose (HP:0003189) (observed) 1\n", + "7 Turricephaly (HP:0000262) (observed) 1\n", + "8 Midface retrusion (HP:0011800) (observed) 1\n", + "9 Broad hallux (HP:0010055) (observed) 1\n", + "10 Finger clinodactyly (HP:0040019) (observed) 1\n", + "11 Pectus excavatum (HP:0000767) (observed) 1\n", + "12 Depressed nasal bridge (HP:0005280) (observed) 1\n", + "13 Preauricular pit (HP:0004467) (observed) 1\n", + "14 Broad thumb (HP:0011304) (observed) 1\n", + "15 High palate (HP:0000218) (observed) 1\n", + "16 Facial asymmetry (HP:0000324) (observed) 1\n", + "17 Telecanthus (HP:0000506) (observed) 1\n", + "18 Short neck (HP:0000470) (observed) 1\n", + "19 Upslanted palpebral fissure (HP:0000582) (observed) 1\n", + "20 Anteverted nares (HP:0000463) (observed) 1\n", + "21 Wide mouth (HP:0000154) (observed) 1\n", + "22 Facial capillary hemangioma (HP:0000996) (observed) 1" ] }, - "execution_count": 5, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# Convert to row based format\n", - "dft = df.transpose()\n", - "dft.columns = dft.iloc[0]\n", - "dft.drop(dft.index[0], inplace=True)\n", - "dft.head()\n", - "# Note that the Individual is now the row index but we need it to be available as a column\n", - "# Therefore, add it as an explicit, new column\n", - "dft['patient_id'] = dft.index\n", - "dft.head()" - ] - }, - { - "cell_type": "markdown", - "id": "ccff0a96", - "metadata": {}, - "source": [ - "

Column mappers

" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "780039a6", - "metadata": {}, - "outputs": [], - "source": [ - "column_mapper_d = defaultdict(ColumnMapper)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "fbceaf13", - "metadata": {}, - "outputs": [], - "source": [ - "# Developmental delay/intellectual disability -- use code to intellectual disability \n", - "severity_id = {'+ (Severe)': 'Intellectual disability, severe',\n", - " '+ (Moderate)': 'Intellectual disability, moderate',\n", - " '+ (Moderate-to-severe)':'Intellectual disability, moderate'}\n", - "idMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=severity_id)\n", - "idMapper.preview_column(dft['Developmental delay/intellectual disability'])\n", - "column_mapper_d['Developmental delay/intellectual disability'] = idMapper" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "fb4d0d5a", - "metadata": {}, - "outputs": [], - "source": [ - "# when building the encoder, inspect all the columns\n", - "# dft.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "bafd7359", - "metadata": {}, - "outputs": [], - "source": [ - "# By inspection, all entries of this column indicate delayed ability to walk. 
Therefore, use ConstantColumnMapper\n", - "# the alternative would be to code each of the varied entries\n", - "delayedWalkColumn = ConstantColumnMapper(hpo_id='HP:0031936', hpo_label='Delayed ability to walk')\n", - "#delayedWalkColumn.preview_column(dft['Ambulate independently (age achieved)'])\n", - "column_mapper_d['Ambulate independently (age achieved)'] = delayedWalkColumn" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "9a64f2ea", - "metadata": {}, - "outputs": [], - "source": [ - "## Same comments for speech\n", - "delayedSpeechColumn = ConstantColumnMapper(hpo_id='HP:0000750', hpo_label='Delayed speech and language development')\n", - "# delayedSpeechColumn.preview_column(dft['Any speech (age attained)'])\n", - "column_mapper_d['Any speech (age attained)'] = delayedSpeechColumn" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "de8ccd1d", - "metadata": {}, - "outputs": [], - "source": [ - "## 'Autism-like features' # Autistic behavior HP:\n", - "autisticFeaturesMapper = SimpleColumnMapper(hpo_id='HP:0000729', hpo_label='Autistic behavior', observed=\"+\", excluded=\"−\")\n", - "#autisticFeaturesMapper.preview_column(dft['Autism-like features'])\n", - "column_mapper_d['Autism-like features'] = autisticFeaturesMapper" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "f6aa015f", - "metadata": {}, - "outputs": [], - "source": [ - "seizure_d = {'Absence': 'Typical absence seizure',\n", - " 'Infantile spasms': 'Infantile spasms',\n", - " 'GTC':'Bilateral tonic-clonic seizure',\n", - " 'ESES': 'Status epilepticus'}\n", - "seizureMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=seizure_d)\n", - "#seizureMapper.preview_column(dft['Seizure types'])\n", - "column_mapper_d['Seizure types'] = seizureMapper" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "41b668db", - "metadata": {}, - "outputs": [], - "source": [ - "# Hypotonia HP:0001252 -- note that we include + (mixed 
tone abnormality) as Hypotonia\n", - "hypotoniaMapper = SimpleColumnMapper(hpo_id='HP:0001252', hpo_label='Hypotonia', \n", - " observed=['+', '+ (mixed tone abnormality)'], excluded='−')\n", - "#hypotoniaMapper.preview_column(dft['Hypotonia'])\n", - "column_mapper_d['Hypotonia'] = hypotoniaMapper" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "0d1e3050", - "metadata": {}, - "outputs": [], - "source": [ - "#dft['Morphological features']\n", "morph_d = {\n", " 'bulbous nasal tip': 'Bulbous nose'\n", "}\n", - "morphologicalMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=morph_d)\n", - "morphologicalMapper.preview_column(dft['Morphological features'])\n", - "column_mapper_d['Morphological features'] = morphologicalMapper" + "morphologicalMapper = OptionColumnMapper(column_name='Morphological features',\n", + " concept_recognizer=hpo_cr, option_d=morph_d)\n", + "column_mapper_list.append(morphologicalMapper)\n", + "morphologicalMapper.preview_column(dft)" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "id": "e439c0a0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mappingcount
0Cervical spinal canal stenosis (HP:0008445) (observed)1
1Chiari type I malformation (HP:0007099) (observed)1
2Scoliosis (HP:0002650) (observed)3
3Torticollis (HP:0000473) (observed)1
4Plagiocephaly (HP:0001357) (observed)1
5Bilateral talipes equinovarus (HP:0001776) (observed)2
6Exotropia (HP:0000577) (observed)2
7Strabismus (HP:0000486) (observed)4
8Cryptorchidism (HP:0000028) (observed)1
9Micropenis (HP:0000054) (observed)1
10Neonatal hypoglycemia (HP:0001998) (observed)1
11Hip dysplasia (HP:0001385) (observed)1
12Athetosis (HP:0002305) (observed)1
13Pes planus (HP:0001763) (observed)1
14Choreoathetosis (HP:0001266) (observed)1
\n", + "
" + ], + "text/plain": [ + " mapping count\n", + "0 Cervical spinal canal stenosis (HP:0008445) (observed) 1\n", + "1 Chiari type I malformation (HP:0007099) (observed) 1\n", + "2 Scoliosis (HP:0002650) (observed) 3\n", + "3 Torticollis (HP:0000473) (observed) 1\n", + "4 Plagiocephaly (HP:0001357) (observed) 1\n", + "5 Bilateral talipes equinovarus (HP:0001776) (observed) 2\n", + "6 Exotropia (HP:0000577) (observed) 2\n", + "7 Strabismus (HP:0000486) (observed) 4\n", + "8 Cryptorchidism (HP:0000028) (observed) 1\n", + "9 Micropenis (HP:0000054) (observed) 1\n", + "10 Neonatal hypoglycemia (HP:0001998) (observed) 1\n", + "11 Hip dysplasia (HP:0001385) (observed) 1\n", + "12 Athetosis (HP:0002305) (observed) 1\n", + "13 Pes planus (HP:0001763) (observed) 1\n", + "14 Choreoathetosis (HP:0001266) (observed) 1" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "other_d = {\n", " 'Chiari I malformation': 'Chiari type I malformation',\n", " 'C1 spinal stenosis':'Cervical spinal canal stenosis'\n", "}\n", - "otherMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=other_d)\n", - "otherMapper.preview_column(dft['Other clinical features'])\n", - "column_mapper_d['Other clinical features'] = otherMapper" + "otherMapper = OptionColumnMapper(column_name='Other clinical features',\n", + " concept_recognizer=hpo_cr, option_d=other_d)\n", + "column_mapper_list.append(otherMapper)\n", + "otherMapper.preview_column(dft)" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 14, "id": "1c981b39", "metadata": {}, "outputs": [], "source": [ "ageMapper = AgeColumnMapper.by_year('Age (years)')\n", - "#ageMapper.preview_column(dft['Age (years)'])\n", + "#ageMapper.preview_column(dft)\n", "sexMapper = SexColumnMapper(male_symbol='M', female_symbol='F', column_name='Sex')\n", - "#sexMapper.preview_column(dft['Sex'])" + "#sexMapper.preview_column(dft)" ] }, { "cell_type": "code", - "execution_count": 17, 
- "id": "b375cdb8-5775-4ba6-a424-4f2d3b3f02a4", + "execution_count": 15, + "id": "fce8583c", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_020952.6%3Ac.2509G>A /NM_020952.6?content-type=application%2Fjson\n", - "https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_020952.6%3Ac.2810C>A /NM_020952.6?content-type=application%2Fjson\n" - ] - } - ], + "outputs": [], "source": [ - "hg38 = 'hg38'\n", "TRPM3_transcript='NM_020952.6'\n", - "vvalidator = VariantValidator(genome_build=hg38, transcript=TRPM3_transcript)\n", - "var_list = dft['cDNA (NM_020952.4) '].unique()\n", - "var_d = {}\n", - "for v in var_list:\n", - " var = vvalidator.encode_hgvs(v)\n", - " var_d[v] = var" + "TRPM3_id = \"HGNC:17992\"\n", + "vman = VariantManager(df=dft, individual_column_name=\"individual_id\", gene_id=TRPM3_id, gene_symbol=\"TRPM3\", transcript=TRPM3_transcript,allele_1_column_name='cDNA (NM_020952.4) ')" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 16, "id": "d0ca245c", "metadata": {}, "outputs": [], "source": [ - "genome = 'hg38'\n", - "transcript='NM_020952.6' # latest version of TRPM3 transcript used in publlication (original: version 4)\n", "# Note there is an extra space at the end of the column name\n", - "varMapper = VariantColumnMapper(variant_d=var_d,\n", + "varMapper = VariantColumnMapper(variant_d=vman.get_variant_d(),\n", " variant_column_name='cDNA (NM_020952.4) ', \n", " default_genotype='heterozygous')\n", "#varMapper.preview_column(column=dft['cDNA (NM_020952.4) '])" @@ -1335,20 +1299,19 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 17, "id": "07912022", "metadata": {}, "outputs": [], "source": [ "encoder = CohortEncoder(df=dft, \n", " hpo_cr=hpo_cr, \n", - " column_mapper_d=column_mapper_d, \n", - " individual_column_name=\"patient_id\", \n", - " 
agemapper=ageMapper, \n", + " column_mapper_list=column_mapper_list, \n", + " individual_column_name=\"individual_id\", \n", + " age_at_last_encounter_mapper=ageMapper, \n", " sexmapper=sexMapper,\n", " variant_mapper=varMapper,\n", - " metadata=metadata,\n", - " pmid=PMID)\n", + " metadata=metadata)\n", "omim_id = \"OMIM:620224\"\n", "omim_label = \"Neurodevelopmental disorder with hypotonia, dysmorphic facies, and skeletal anomalies, with or without seizures\"\n", "disease = Disease(disease_id=omim_id, disease_label=omim_label)\n", @@ -1357,7 +1320,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 18, "id": "314dfd69", "metadata": {}, "outputs": [], @@ -1377,22 +1340,36 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 19, "id": "b185fed2-20c2-44a0-bfd3-075d12d53ecc", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "We found 0 validation errors\n" - ] + "data": { + "text/html": [ + "

Cohort validation

\n", + "

Errors found with 3 of 8 phenopackets.

\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Error counts
LevelError categoryCount
WARNINGREDUNDANT2
INFORMATIONNOT_MEASURED2
\n", + "

A total of 4 issues were fixed and no individual was removed from the cohort.

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "cvalidator = ContentValidator(min_var=1, min_hpo=1)\n", - "errors = cvalidator.validate_phenopacket_list([i.to_ga4gh_phenopacket(metadata.to_ga4gh()) for i in individuals])\n", - "print(f\"We found {len(errors)} validation errors\")" + "cvalidator = CohortValidator(cohort=individuals, ontology=hpo_ontology, min_hpo=1,\n", + " allelic_requirement=AllelicRequirement.MONO_ALLELIC)\n", + "qc = QcVisualizer(cohort_validator=cvalidator)\n", + "display(HTML(qc.to_summary_html()))" ] }, { @@ -1406,73 +1383,25 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 20, "id": "b30143b6-7ea6-4196-875a-b2718d382a5b", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "\n", - "\n", + "
\n", "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
8 phenopackets - PMID:31278393 (n=8)
IndividualDiseaseGenotypePhenotypic features
1 (MALE; P16Y)\n", - "Neurodevelopmental disorder with hypotonia, dysmorphic facies, and skeletal anomalies, with or without seizures (OMIM:620224)\n", - "NM_020952.6:c.2509G>A (heterozygous)Intellectual disability, severe (HP:0010864); Delayed ability to walk (HP:0031936); Delayed speech and language development (HP:0000750); Autistic behavior (HP:0000729); Typical absence seizure (HP:0011147); Hypotonia (HP:0001252); Bulbous nose (HP:0000414); Cervical spinal canal stenosis (HP:0008445); Chiari type I malformation (HP:0007099)
2 (MALE; P4Y9M)\n", - "Neurodevelopmental disorder with hypotonia, dysmorphic facies, and skeletal anomalies, with or without seizures (OMIM:620224)\n", - "NM_020952.6:c.2509G>A (heterozygous)Intellectual disability, moderate (HP:0002342); Delayed ability to walk (HP:0031936); Delayed speech and language development (HP:0000750); Infantile spasms (HP:0012469); Hypotonia (HP:0001252)
3 (FEMALE; P6Y)\n", - "Neurodevelopmental disorder with hypotonia, dysmorphic facies, and skeletal anomalies, with or without seizures (OMIM:620224)\n", - "NM_020952.6:c.2509G>A (heterozygous)Intellectual disability, moderate (HP:0002342); Delayed ability to walk (HP:0031936); Delayed speech and language development (HP:0000750); Autistic behavior (HP:0000729); Bilateral tonic-clonic seizure (HP:0002069); Hypotonia (HP:0001252)
4 (MALE; P5Y11M)\n", - "Neurodevelopmental disorder with hypotonia, dysmorphic facies, and skeletal anomalies, with or without seizures (OMIM:620224)\n", - "NM_020952.6:c.2509G>A (heterozygous)Intellectual disability, severe (HP:0010864); Delayed ability to walk (HP:0031936); Delayed speech and language development (HP:0000750); Autistic behavior (HP:0000729); Status epilepticus (HP:0002133); Hypotonia (HP:0001252)
5 (MALE; P6Y3M)\n", - "Neurodevelopmental disorder with hypotonia, dysmorphic facies, and skeletal anomalies, with or without seizures (OMIM:620224)\n", - "NM_020952.6:c.2509G>A (heterozygous)Intellectual disability, severe (HP:0010864); Delayed ability to walk (HP:0031936); Delayed speech and language development (HP:0000750); Autistic behavior (HP:0000729); Hypotonia (HP:0001252)
6 (MALE; P28Y)\n", - "Neurodevelopmental disorder with hypotonia, dysmorphic facies, and skeletal anomalies, with or without seizures (OMIM:620224)\n", - "NM_020952.6:c.2509G>A (heterozygous)Intellectual disability, severe (HP:0010864); Delayed ability to walk (HP:0031936); Delayed speech and language development (HP:0000750); Bilateral tonic-clonic seizure (HP:0002069); Typical absence seizure (HP:0011147)
7 (MALE; P38Y)\n", - "Neurodevelopmental disorder with hypotonia, dysmorphic facies, and skeletal anomalies, with or without seizures (OMIM:620224)\n", - "NM_020952.6:c.2509G>A (heterozygous)Intellectual disability, moderate (HP:0002342); Delayed ability to walk (HP:0031936); Delayed speech and language development (HP:0000750); Typical absence seizure (HP:0011147); Hypotonia (HP:0001252); Bulbous nose (HP:0000414)
8 (FEMALE; P8Y1M)\n", - "Neurodevelopmental disorder with hypotonia, dysmorphic facies, and skeletal anomalies, with or without seizures (OMIM:620224)\n", - "NM_020952.6:c.2810C>A (heterozygous)Intellectual disability, moderate (HP:0002342); Delayed ability to walk (HP:0031936); Delayed speech and language development (HP:0000750); Typical absence seizure (HP:0011147); Hypotonia (HP:0001252)
\n" + "IndividualDiseaseGenotypePhenotypic features\n", + "1 (MALE; P16Y)Neurodevelopmental disorder with hypotonia, dysmorphic facies, and skeletal anomalies, with or without seizures (OMIM:620224)NM_020952.6:c.2509G>A (heterozygous)Intellectual disability, severe (HP:0010864); Delayed ability to walk (HP:0031936); Delayed speech and language development (HP:0000750); Autistic behavior (HP:0000729); Typical absence seizure (HP:0011147); Hypotonia (HP:0001252); Broad forehead (HP:0000337); Ptosis (HP:0000508); Bulbous nose (HP:0000414); Micrognathia (HP:0000347); Tapered finger (HP:0001182); Cervical spinal canal stenosis (HP:0008445); Chiari type I malformation (HP:0007099); Scoliosis (HP:0002650); Torticollis (HP:0000473); Plagiocephaly (HP:0001357); Bilateral talipes equinovarus (HP:0001776); Exotropia (HP:0000577)\n", + "2 (MALE; P4Y9M)Neurodevelopmental disorder with hypotonia, dysmorphic facies, and skeletal anomalies, with or without seizures (OMIM:620224)NM_020952.6:c.2509G>A (heterozygous)Intellectual disability, moderate (HP:0002342); Delayed ability to walk (HP:0031936); Delayed speech and language development (HP:0000750); Infantile spasms (HP:0012469); Hypotonia (HP:0001252); Short philtrum (HP:0000322); Long nose (HP:0003189); Turricephaly (HP:0000262)\n", + "3 (FEMALE; P6Y)Neurodevelopmental disorder with hypotonia, dysmorphic facies, and skeletal anomalies, with or without seizures (OMIM:620224)NM_020952.6:c.2509G>A (heterozygous)Intellectual disability, moderate (HP:0002342); Delayed ability to walk (HP:0031936); Delayed speech and language development (HP:0000750); Autistic behavior (HP:0000729); Bilateral tonic-clonic seizure (HP:0002069); Hypotonia (HP:0001252)\n", + "4 (MALE; P5Y11M)Neurodevelopmental disorder with hypotonia, dysmorphic facies, and skeletal anomalies, with or without seizures (OMIM:620224)NM_020952.6:c.2509G>A (heterozygous)Intellectual disability, severe (HP:0010864); Delayed ability to walk (HP:0031936); Delayed speech and 
language development (HP:0000750); Autistic behavior (HP:0000729); Status epilepticus (HP:0002133); Hypotonia (HP:0001252); Broad forehead (HP:0000337); Midface retrusion (HP:0011800); Short philtrum (HP:0000322); Micrognathia (HP:0000347); Broad hallux (HP:0010055); Finger clinodactyly (HP:0040019); Pectus excavatum (HP:0000767); Strabismus (HP:0000486)\n", + "5 (MALE; P6Y3M)Neurodevelopmental disorder with hypotonia, dysmorphic facies, and skeletal anomalies, with or without seizures (OMIM:620224)NM_020952.6:c.2509G>A (heterozygous)Intellectual disability, severe (HP:0010864); Delayed ability to walk (HP:0031936); Delayed speech and language development (HP:0000750); Autistic behavior (HP:0000729); Hypotonia (HP:0001252); Broad forehead (HP:0000337); Depressed nasal bridge (HP:0005280); Preauricular pit (HP:0004467); Broad thumb (HP:0011304); Cryptorchidism (HP:0000028); Micropenis (HP:0000054); Bilateral talipes equinovarus (HP:0001776)\n", + "6 (MALE; P28Y)Neurodevelopmental disorder with hypotonia, dysmorphic facies, and skeletal anomalies, with or without seizures (OMIM:620224)NM_020952.6:c.2509G>A (heterozygous)Intellectual disability, severe (HP:0010864); Delayed ability to walk (HP:0031936); Delayed speech and language development (HP:0000750); Bilateral tonic-clonic seizure (HP:0002069); Typical absence seizure (HP:0011147); Micrognathia (HP:0000347); High palate (HP:0000218); Neonatal hypoglycemia (HP:0001998); Hip dysplasia (HP:0001385); Scoliosis (HP:0002650); excluded: Autistic behavior (HP:0000729); excluded: Hypotonia (HP:0001252)\n", + "7 (MALE; P38Y)Neurodevelopmental disorder with hypotonia, dysmorphic facies, and skeletal anomalies, with or without seizures (OMIM:620224)NM_020952.6:c.2509G>A (heterozygous)Intellectual disability, moderate (HP:0002342); Delayed ability to walk (HP:0031936); Delayed speech and language development (HP:0000750); Typical absence seizure (HP:0011147); Hypotonia (HP:0001252); Facial asymmetry (HP:0000324); Ptosis 
(HP:0000508); Telecanthus (HP:0000506); Bulbous nose (HP:0000414); Micrognathia (HP:0000347); Short neck (HP:0000470); Exotropia (HP:0000577); Athetosis (HP:0002305); Pes planus (HP:0001763)\n", + "8 (FEMALE; P8Y1M)Neurodevelopmental disorder with hypotonia, dysmorphic facies, and skeletal anomalies, with or without seizures (OMIM:620224)NM_020952.6:c.2810C>A (heterozygous)Intellectual disability, moderate (HP:0002342); Delayed ability to walk (HP:0031936); Delayed speech and language development (HP:0000750); Typical absence seizure (HP:0011147); Hypotonia (HP:0001252); Broad forehead (HP:0000337); Upslanted palpebral fissure (HP:0000582); Anteverted nares (HP:0000463); Short philtrum (HP:0000322); Wide mouth (HP:0000154); Facial capillary hemangioma (HP:0000996); Choreoathetosis (HP:0001266); Strabismus (HP:0000486); Scoliosis (HP:0002650); excluded: Autistic behavior (HP:0000729)\n", + "" ], "text/plain": [ "" @@ -1483,9 +1412,8 @@ } ], "source": [ - "from IPython.display import HTML, display\n", - "phenopackets = [i.to_ga4gh_phenopacket(metadata=metadata.to_ga4gh()) for i in individuals]\n", - "table = PhenopacketTable(phenopacket_list=phenopackets)\n", + "individuals = cvalidator.get_error_free_individual_list()\n", + "table = IndividualTable(individuals)\n", "display(HTML(table.to_html()))" ] }, @@ -1499,7 +1427,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 21, "id": "c50e59f1", "metadata": {}, "outputs": [ @@ -1514,8 +1442,7 @@ "source": [ "output_directory = \"phenopackets\"\n", "Individual.output_individuals_as_phenopackets(individual_list=individuals,\n", - " metadata=metadata.to_ga4gh(),\n", - " pmid=PMID,\n", + " metadata=metadata,\n", " outdir=output_directory)" ] }, @@ -1530,9 +1457,9 @@ ], "metadata": { "kernelspec": { - "display_name": "ppt_venv", + "display_name": "Python 3", "language": "python", - "name": "ppt_venv" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -1544,7 +1471,7 @@ "name": 
"python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.3" + "version": "3.9.6" } }, "nbformat": 4, diff --git a/notebooks/Example_2_PMID_30945334_MAPK8IP3.ipynb b/notebooks/Example_2_PMID_30945334_MAPK8IP3.ipynb index f5f885e4..a2d33dcc 100644 --- a/notebooks/Example_2_PMID_30945334_MAPK8IP3.ipynb +++ b/notebooks/Example_2_PMID_30945334_MAPK8IP3.ipynb @@ -22,7 +22,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "Using pyphetools version 0.7.1\n" + "Using pyphetools version 0.9.77\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/robin/GIT/pyphetools/ppt_venv/lib/python3.9/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020\n", + " warnings.warn(\n" ] } ], @@ -30,12 +38,13 @@ "import phenopackets as php\n", "from google.protobuf.json_format import MessageToDict, MessageToJson\n", "from google.protobuf.json_format import Parse, ParseDict\n", + "from IPython.display import display, HTML\n", "import pandas as pd\n", "pd.set_option('display.max_colwidth', None) # show entire column contents, important!\n", "from collections import defaultdict\n", "import pyphetools\n", "from pyphetools.creation import *\n", - "from pyphetools.validation import ContentValidator\n", + "from pyphetools.validation import CohortValidator\n", "from pyphetools.visualization import *\n", "\n", "print(f\"Using pyphetools version {pyphetools.__version__}\")" @@ -59,11 +68,13 @@ "outputs": [], "source": [ "parser = HpoParser()\n", + "hp_ontology = parser.get_ontology()\n", "hpo_cr = parser.get_hpo_concept_recognizer()\n", "hpo_version = parser.get_version()\n", "PMID = \"PMID:30945334\"\n", "title = \"Recurrent de novo MAPK8IP3 variants cause neurological phenotypes\"\n", - "metadata = MetaData(created_by=\"ORCID:0000-0002-0736-9199\", pmid=PMID, 
pubmed_title=title)\n", + "cite = Citation(pmid=PMID, title=title)\n", + "metadata = MetaData(created_by=\"ORCID:0000-0002-0736-9199\", citation=cite)\n", "metadata.default_versions_with_hpo(version=hpo_version)" ] }, @@ -141,514 +152,18 @@ " (p.Arg1146Cys)\n", " (p.Arg1146Cys)\n", " \n", - " \n", - " 2\n", - " Age (yr)\n", - " 29\n", - " 27\n", - " 16\n", - " 5\n", - " 5\n", - " \n", - " \n", - " 3\n", - " Sex\n", - " Male\n", - " Female\n", - " Male\n", - " Male\n", - " Female\n", - " \n", - " \n", - " 4\n", - " Gestational ages (weeks)\n", - " 39\n", - " 40\n", - " 40\n", - " 36\n", - " 41\n", - " \n", - " \n", - " 5\n", - " Delayed motor development\n", - " +\n", - " +\n", - " +\n", - " +\n", - " +\n", - " \n", - " \n", - " 6\n", - " Age at head control (months)\n", - " 2.5\n", - " 3.5\n", - " 4\n", - " 5\n", - " 5\n", - " \n", - " \n", - " 7\n", - " Age at rolling (months)\n", - " ND\n", - " 11\n", - " 6\n", - " 7\n", - " 6\n", - " \n", - " \n", - " 8\n", - " Age at unsupported sitting (months)\n", - " 7\n", - " 6\n", - " Not acquired\n", - " 15\n", - " 11\n", - " \n", - " \n", - " 9\n", - " Age at crawling (months)\n", - " Not acquired\n", - " 11\n", - " ND\n", - " 18\n", - " 18\n", - " \n", - " \n", - " 10\n", - " Age at walking (months)\n", - " Not acquired\n", - " Not acquired\n", - " Not acquired\n", - " Not acquired\n", - " 48\n", - " \n", - " \n", - " 11\n", - " Intellectual disability\n", - " Severe\n", - " Severe\n", - " Profound\n", - " Severe\n", - " Severe\n", - " \n", - " \n", - " 12\n", - " Autistic behavior\n", - " −\n", - " −\n", - " −\n", - " +\n", - " +\n", - " \n", - " \n", - " 13\n", - " Language comprehension\n", - " +\n", - " +\n", - " +\n", - " −\n", - " −\n", - " \n", - " \n", - " 14\n", - " Language skills\n", - " Simple two-word sentences\n", - " Simple two-word sentences\n", - " Simple words\n", - " Nonverbal\n", - " Nonverbal\n", - " \n", - " \n", - " 15\n", - " Spastic diplegia\n", - " +\n", - " +\n", - " +\n", - " −\n", - " +\n", 
- " \n", - " \n", - " 16\n", - " Gross motor skills\n", - " Wheelchair bound\n", - " Wheelchair bound\n", - " Wheelchair bound\n", - " Cruising\n", - " Walking\n", - " \n", - " \n", - " 17\n", - " Infantile hypotonia\n", - " −\n", - " −\n", - " −\n", - " +\n", - " +\n", - " \n", - " \n", - " 18\n", - " History of regression\n", - " −\n", - " −\n", - " −\n", - " −\n", - " −\n", - " \n", - " \n", - " 19\n", - " Epilepsy\n", - " −\n", - " +\n", - " +\n", - " −\n", - " −\n", - " \n", - " \n", - " 20\n", - " EEG\n", - " High-voltage with slow waves with spikes\n", - " High-voltage with slow waves with spikes\n", - " 3-Hz spike-wave discharges on the right frontal quadrant during sleep\n", - " Normal\n", - " ND\n", - " \n", - " \n", - " 21\n", - " Brain MRI\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " \n", - " \n", - " 22\n", - " Cerebral atrophy\n", - " +\n", - " +\n", - " + (mild, right hemisphere)\n", - " +\n", - " +\n", - " \n", - " \n", - " 23\n", - " Delayed myelination\n", - " +\n", - " +\n", - " −\n", - " −\n", - " +\n", - " \n", - " \n", - " 24\n", - " Corpus callosum hypoplasia\n", - " ++\n", - " ++\n", - " ++\n", - " ++\n", - " ++\n", - " \n", - " \n", - " 25\n", - " Facial dysmorphism\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " \n", - " \n", - " 26\n", - " Round face\n", - " +\n", - " +\n", - " −\n", - " +\n", - " +\n", - " \n", - " \n", - " 27\n", - " Prominent nasal bridge\n", - " −\n", - " −\n", - " +\n", - " +\n", - " +\n", - " \n", - " \n", - " 28\n", - " Thin upper lip\n", - " +\n", - " +\n", - " +\n", - " +\n", - " +\n", - " \n", - " \n", - " 29\n", - " Others\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " Long and thick eyebrows, upper slanted palpebral fissures, anteverted nares, short philtrum\n", - " \n", - " \n", - " 30\n", - " Other\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " \n", - " \n", - " 31\n", - " Short stature\n", - " +\n", - " +\n", - " +\n", - " +\n", - " 
−\n", - " \n", - " \n", - " 32\n", - " Obesity\n", - " +\n", - " +\n", - " +\n", - " −\n", - " −\n", - " \n", - " \n", - " 33\n", - " Precocious puberty\n", - " +\n", - " +\n", - " ND\n", - " −\n", - " −\n", - " \n", " \n", "\n", "" ], "text/plain": [ - " identifier \\\n", - "0 Variant (hg19, NM_015133.4) \n", - "1 Protein variant \n", - "2 Age (yr) \n", - "3 Sex \n", - "4 Gestational ages (weeks) \n", - "5 Delayed motor development \n", - "6 Age at head control (months) \n", - "7 Age at rolling (months) \n", - "8 Age at unsupported sitting (months) \n", - "9 Age at crawling (months) \n", - "10 Age at walking (months) \n", - "11 Intellectual disability \n", - "12 Autistic behavior \n", - "13 Language comprehension \n", - "14 Language skills \n", - "15 Spastic diplegia \n", - "16 Gross motor skills \n", - "17 Infantile hypotonia \n", - "18 History of regression \n", - "19 Epilepsy \n", - "20 EEG \n", - "21 Brain MRI \n", - "22 Cerebral atrophy \n", - "23 Delayed myelination \n", - "24 Corpus callosum hypoplasia \n", - "25 Facial dysmorphism \n", - "26 Round face \n", - "27 Prominent nasal bridge \n", - "28 Thin upper lip \n", - "29 Others \n", - "30 Other \n", - "31 Short stature \n", - "32 Obesity \n", - "33 Precocious puberty \n", - "\n", - " Individual 1 \\\n", - "0 c.1732C>T \n", - "1 (p.Arg578Cys) \n", - "2 29 \n", - "3 Male \n", - "4 39 \n", - "5 + \n", - "6 2.5 \n", - "7 ND \n", - "8 7 \n", - "9 Not acquired \n", - "10 Not acquired \n", - "11 Severe \n", - "12 − \n", - "13 + \n", - "14 Simple two-word sentences \n", - "15 + \n", - "16 Wheelchair bound \n", - "17 − \n", - "18 − \n", - "19 − \n", - "20 High-voltage with slow waves with spikes \n", - "21 NaN \n", - "22 + \n", - "23 + \n", - "24 ++ \n", - "25 NaN \n", - "26 + \n", - "27 − \n", - "28 + \n", - "29 NaN \n", - "30 NaN \n", - "31 + \n", - "32 + \n", - "33 + \n", - "\n", - " Individual 2 \\\n", - "0 c.1732C>T \n", - "1 (p.Arg578Cys) \n", - "2 27 \n", - "3 Female \n", - "4 40 \n", - "5 + \n", - "6 3.5 
\n", - "7 11 \n", - "8 6 \n", - "9 11 \n", - "10 Not acquired \n", - "11 Severe \n", - "12 − \n", - "13 + \n", - "14 Simple two-word sentences \n", - "15 + \n", - "16 Wheelchair bound \n", - "17 − \n", - "18 − \n", - "19 + \n", - "20 High-voltage with slow waves with spikes \n", - "21 NaN \n", - "22 + \n", - "23 + \n", - "24 ++ \n", - "25 NaN \n", - "26 + \n", - "27 − \n", - "28 + \n", - "29 NaN \n", - "30 NaN \n", - "31 + \n", - "32 + \n", - "33 + \n", + " identifier Individual 1 Individual 2 Individual 3 \\\n", + "0 Variant (hg19, NM_015133.4) c.1732C>T c.1732C>T c.1732C>T \n", + "1 Protein variant (p.Arg578Cys) (p.Arg578Cys) (p.Arg578Cys) \n", "\n", - " Individual 3 \\\n", - "0 c.1732C>T \n", - "1 (p.Arg578Cys) \n", - "2 16 \n", - "3 Male \n", - "4 40 \n", - "5 + \n", - "6 4 \n", - "7 6 \n", - "8 Not acquired \n", - "9 ND \n", - "10 Not acquired \n", - "11 Profound \n", - "12 − \n", - "13 + \n", - "14 Simple words \n", - "15 + \n", - "16 Wheelchair bound \n", - "17 − \n", - "18 − \n", - "19 + \n", - "20 3-Hz spike-wave discharges on the right frontal quadrant during sleep \n", - "21 NaN \n", - "22 + (mild, right hemisphere) \n", - "23 − \n", - "24 ++ \n", - "25 NaN \n", - "26 − \n", - "27 + \n", - "28 + \n", - "29 NaN \n", - "30 NaN \n", - "31 + \n", - "32 + \n", - "33 ND \n", - "\n", - " Individual 4 \\\n", - "0 c.3436C>T \n", - "1 (p.Arg1146Cys) \n", - "2 5 \n", - "3 Male \n", - "4 36 \n", - "5 + \n", - "6 5 \n", - "7 7 \n", - "8 15 \n", - "9 18 \n", - "10 Not acquired \n", - "11 Severe \n", - "12 + \n", - "13 − \n", - "14 Nonverbal \n", - "15 − \n", - "16 Cruising \n", - "17 + \n", - "18 − \n", - "19 − \n", - "20 Normal \n", - "21 NaN \n", - "22 + \n", - "23 − \n", - "24 ++ \n", - "25 NaN \n", - "26 + \n", - "27 + \n", - "28 + \n", - "29 NaN \n", - "30 NaN \n", - "31 + \n", - "32 − \n", - "33 − \n", - "\n", - " Individual 5 \n", - "0 c.3436C>T \n", - "1 (p.Arg1146Cys) \n", - "2 5 \n", - "3 Female \n", - "4 41 \n", - "5 + \n", - "6 5 \n", - "7 6 \n", - "8 11 
\n", - "9 18 \n", - "10 48 \n", - "11 Severe \n", - "12 + \n", - "13 − \n", - "14 Nonverbal \n", - "15 + \n", - "16 Walking \n", - "17 + \n", - "18 − \n", - "19 − \n", - "20 ND \n", - "21 NaN \n", - "22 + \n", - "23 + \n", - "24 ++ \n", - "25 NaN \n", - "26 + \n", - "27 + \n", - "28 + \n", - "29 Long and thick eyebrows, upper slanted palpebral fissures, anteverted nares, short philtrum \n", - "30 NaN \n", - "31 − \n", - "32 − \n", - "33 − " + " Individual 4 Individual 5 \n", + "0 c.3436C>T c.3436C>T \n", + "1 (p.Arg1146Cys) (p.Arg1146Cys) " ] }, "execution_count": 4, @@ -657,7 +172,7 @@ } ], "source": [ - "df" + "df.head(2)" ] }, { @@ -733,7 +248,7 @@ " 2.5\n", " ND\n", " 7\n", - " Not acquired\n", + " 468\n", " ...\n", " ++\n", " NaN\n", @@ -770,141 +285,41 @@ " +\n", " +\n", " \n", - " \n", - " Individual 3\n", - " c.1732C>T\n", - " (p.Arg578Cys)\n", - " 16\n", - " Male\n", - " 40\n", - " +\n", - " 4\n", - " 6\n", - " Not acquired\n", - " ND\n", - " ...\n", - " ++\n", - " NaN\n", - " −\n", - " +\n", - " +\n", - " NaN\n", - " NaN\n", - " +\n", - " +\n", - " ND\n", - " \n", - " \n", - " Individual 4\n", - " c.3436C>T\n", - " (p.Arg1146Cys)\n", - " 5\n", - " Male\n", - " 36\n", - " +\n", - " 5\n", - " 7\n", - " 15\n", - " 18\n", - " ...\n", - " ++\n", - " NaN\n", - " +\n", - " +\n", - " +\n", - " NaN\n", - " NaN\n", - " +\n", - " −\n", - " −\n", - " \n", - " \n", - " Individual 5\n", - " c.3436C>T\n", - " (p.Arg1146Cys)\n", - " 5\n", - " Female\n", - " 41\n", - " +\n", - " 5\n", - " 6\n", - " 11\n", - " 18\n", - " ...\n", - " ++\n", - " NaN\n", - " +\n", - " +\n", - " +\n", - " Long and thick eyebrows, upper slanted palpebral fissures, anteverted nares, short philtrum\n", - " NaN\n", - " −\n", - " −\n", - " −\n", - " \n", " \n", "\n", - "

5 rows × 34 columns

\n", + "

2 rows × 34 columns

\n", "" ], "text/plain": [ "identifier Variant (hg19, NM_015133.4) Protein variant Age (yr) Sex \\\n", "Individual 1 c.1732C>T (p.Arg578Cys) 29 Male \n", "Individual 2 c.1732C>T (p.Arg578Cys) 27 Female \n", - "Individual 3 c.1732C>T (p.Arg578Cys) 16 Male \n", - "Individual 4 c.3436C>T (p.Arg1146Cys) 5 Male \n", - "Individual 5 c.3436C>T (p.Arg1146Cys) 5 Female \n", "\n", "identifier Gestational ages (weeks) Delayed motor development \\\n", "Individual 1 39 + \n", "Individual 2 40 + \n", - "Individual 3 40 + \n", - "Individual 4 36 + \n", - "Individual 5 41 + \n", "\n", "identifier Age at head control (months) Age at rolling (months) \\\n", "Individual 1 2.5 ND \n", "Individual 2 3.5 11 \n", - "Individual 3 4 6 \n", - "Individual 4 5 7 \n", - "Individual 5 5 6 \n", "\n", "identifier Age at unsupported sitting (months) Age at crawling (months) \\\n", - "Individual 1 7 Not acquired \n", + "Individual 1 7 468 \n", "Individual 2 6 11 \n", - "Individual 3 Not acquired ND \n", - "Individual 4 15 18 \n", - "Individual 5 11 18 \n", "\n", "identifier ... Corpus callosum hypoplasia Facial dysmorphism Round face \\\n", "Individual 1 ... ++ NaN + \n", "Individual 2 ... ++ NaN + \n", - "Individual 3 ... ++ NaN − \n", - "Individual 4 ... ++ NaN + \n", - "Individual 5 ... 
++ NaN + \n", - "\n", - "identifier Prominent nasal bridge Thin upper lip \\\n", - "Individual 1 − + \n", - "Individual 2 − + \n", - "Individual 3 + + \n", - "Individual 4 + + \n", - "Individual 5 + + \n", "\n", - "identifier Others \\\n", - "Individual 1 NaN \n", - "Individual 2 NaN \n", - "Individual 3 NaN \n", - "Individual 4 NaN \n", - "Individual 5 Long and thick eyebrows, upper slanted palpebral fissures, anteverted nares, short philtrum \n", + "identifier Prominent nasal bridge Thin upper lip Others Other Short stature \\\n", + "Individual 1 − + NaN NaN + \n", + "Individual 2 − + NaN NaN + \n", "\n", - "identifier Other Short stature Obesity Precocious puberty \n", - "Individual 1 NaN + + + \n", - "Individual 2 NaN + + + \n", - "Individual 3 NaN + + ND \n", - "Individual 4 NaN + − − \n", - "Individual 5 NaN − − − \n", + "identifier Obesity Precocious puberty \n", + "Individual 1 + + \n", + "Individual 2 + + \n", "\n", - "[5 rows x 34 columns]" + "[2 rows x 34 columns]" ] }, "execution_count": 5, @@ -917,7 +332,7 @@ "\n", "dft.columns = dft.iloc[0]\n", "dft.drop(dft.index[0], inplace=True)\n", - "dft.head()" + "dft.head(2)" ] }, { @@ -977,7 +392,7 @@ " Short stature\n", " Obesity\n", " Precocious puberty\n", - " patient_id\n", + " individual_id\n", " \n", " \n", " \n", @@ -992,7 +407,7 @@ " 2.5\n", " ND\n", " 7\n", - " Not acquired\n", + " 468\n", " ...\n", " NaN\n", " +\n", @@ -1029,141 +444,41 @@ " +\n", " Individual 2\n", " \n", - " \n", - " Individual 3\n", - " c.1732C>T\n", - " (p.Arg578Cys)\n", - " 16\n", - " Male\n", - " 40\n", - " +\n", - " 4\n", - " 6\n", - " Not acquired\n", - " ND\n", - " ...\n", - " NaN\n", - " −\n", - " +\n", - " +\n", - " NaN\n", - " NaN\n", - " +\n", - " +\n", - " ND\n", - " Individual 3\n", - " \n", - " \n", - " Individual 4\n", - " c.3436C>T\n", - " (p.Arg1146Cys)\n", - " 5\n", - " Male\n", - " 36\n", - " +\n", - " 5\n", - " 7\n", - " 15\n", - " 18\n", - " ...\n", - " NaN\n", - " +\n", - " +\n", - " +\n", - " NaN\n", - " 
NaN\n", - " +\n", - " −\n", - " −\n", - " Individual 4\n", - " \n", - " \n", - " Individual 5\n", - " c.3436C>T\n", - " (p.Arg1146Cys)\n", - " 5\n", - " Female\n", - " 41\n", - " +\n", - " 5\n", - " 6\n", - " 11\n", - " 18\n", - " ...\n", - " NaN\n", - " +\n", - " +\n", - " +\n", - " Long and thick eyebrows, upper slanted palpebral fissures, anteverted nares, short philtrum\n", - " NaN\n", - " −\n", - " −\n", - " −\n", - " Individual 5\n", - " \n", " \n", "\n", - "

5 rows × 35 columns

\n", + "

2 rows × 35 columns

\n", "" ], "text/plain": [ "identifier Variant (hg19, NM_015133.4) Protein variant Age (yr) Sex \\\n", "Individual 1 c.1732C>T (p.Arg578Cys) 29 Male \n", "Individual 2 c.1732C>T (p.Arg578Cys) 27 Female \n", - "Individual 3 c.1732C>T (p.Arg578Cys) 16 Male \n", - "Individual 4 c.3436C>T (p.Arg1146Cys) 5 Male \n", - "Individual 5 c.3436C>T (p.Arg1146Cys) 5 Female \n", "\n", "identifier Gestational ages (weeks) Delayed motor development \\\n", "Individual 1 39 + \n", "Individual 2 40 + \n", - "Individual 3 40 + \n", - "Individual 4 36 + \n", - "Individual 5 41 + \n", "\n", "identifier Age at head control (months) Age at rolling (months) \\\n", "Individual 1 2.5 ND \n", "Individual 2 3.5 11 \n", - "Individual 3 4 6 \n", - "Individual 4 5 7 \n", - "Individual 5 5 6 \n", "\n", "identifier Age at unsupported sitting (months) Age at crawling (months) \\\n", - "Individual 1 7 Not acquired \n", + "Individual 1 7 468 \n", "Individual 2 6 11 \n", - "Individual 3 Not acquired ND \n", - "Individual 4 15 18 \n", - "Individual 5 11 18 \n", "\n", "identifier ... Facial dysmorphism Round face Prominent nasal bridge \\\n", "Individual 1 ... NaN + − \n", "Individual 2 ... NaN + − \n", - "Individual 3 ... NaN − + \n", - "Individual 4 ... NaN + + \n", - "Individual 5 ... 
NaN + + \n", - "\n", - "identifier Thin upper lip \\\n", - "Individual 1 + \n", - "Individual 2 + \n", - "Individual 3 + \n", - "Individual 4 + \n", - "Individual 5 + \n", "\n", - "identifier Others \\\n", - "Individual 1 NaN \n", - "Individual 2 NaN \n", - "Individual 3 NaN \n", - "Individual 4 NaN \n", - "Individual 5 Long and thick eyebrows, upper slanted palpebral fissures, anteverted nares, short philtrum \n", + "identifier Thin upper lip Others Other Short stature Obesity \\\n", + "Individual 1 + NaN NaN + + \n", + "Individual 2 + NaN NaN + + \n", "\n", - "identifier Other Short stature Obesity Precocious puberty patient_id \n", - "Individual 1 NaN + + + Individual 1 \n", - "Individual 2 NaN + + + Individual 2 \n", - "Individual 3 NaN + + ND Individual 3 \n", - "Individual 4 NaN + − − Individual 4 \n", - "Individual 5 NaN − − − Individual 5 \n", + "identifier Precocious puberty individual_id \n", + "Individual 1 + Individual 1 \n", + "Individual 2 + Individual 2 \n", "\n", - "[5 rows x 35 columns]" + "[2 rows x 35 columns]" ] }, "execution_count": 6, @@ -1173,8 +488,8 @@ ], "source": [ "dft.index\n", - "dft['patient_id'] = dft.index\n", - "dft.head()" + "dft['individual_id'] = dft.index\n", + "dft.head(2)" ] }, { @@ -1193,7 +508,7 @@ "metadata": {}, "outputs": [], "source": [ - "column_mapper_d = defaultdict(ColumnMapper)" + "column_mapper_list = list()" ] }, { @@ -1223,88 +538,203 @@ " \n", " \n", " \n", - " term\n", - " status\n", + " mapping\n", + " count\n", " \n", " \n", " \n", " \n", " 0\n", - " Motor delay (HP:0001270)\n", - " observed\n", - " \n", - " \n", - " 1\n", - " Motor delay (HP:0001270)\n", - " observed\n", + " original value: \"+\" -> HP: Motor delay (HP:0001270) (observed)\n", + " 5\n", " \n", - " \n", - " 2\n", - " Motor delay (HP:0001270)\n", - " observed\n", + " \n", + "\n", + "" + ], + "text/plain": [ + " mapping count\n", + "0 original value: \"+\" -> HP: Motor delay (HP:0001270) (observed) 5" + ] + }, + "execution_count": 8, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "delayedMotorMapper = SimpleColumnMapper(column_name='Delayed motor development',\n", + " hpo_id='HP:0001270',hpo_label='Motor delay',observed='+', excluded='-')\n", + "column_mapper_list.append(delayedMotorMapper)\n", + "delayedMotorMapper.preview_column(dft)" + ] + }, + { + "cell_type": "markdown", + "id": "03adfdb5", + "metadata": {}, + "source": [ + "

ThresholdedColumnMapper

\n", + "

Use this mapper for phenotypic features that are reported as ages (numbers). For instance, \n", + "if \"Age at head control (months)\" is over 4 months, we would call \n", + "Persistent head lag HP:0032988.

\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "82cfbe6c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", "
mapping: None-4.0 monthcount
3Motor delay (HP:0001270)observed0Persistent head lag (HP:0032988): not measured3
4Motor delay (HP:0001270)observed1Persistent head lag (HP:0032988): observed2
\n", "
" ], "text/plain": [ - " term status\n", - "0 Motor delay (HP:0001270) observed\n", - "1 Motor delay (HP:0001270) observed\n", - "2 Motor delay (HP:0001270) observed\n", - "3 Motor delay (HP:0001270) observed\n", - "4 Motor delay (HP:0001270) observed" + " mapping: None-4.0 month count\n", + "0 Persistent head lag (HP:0032988): not measured 3\n", + "1 Persistent head lag (HP:0032988): observed 2" ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "delayedMotorMapper = SimpleColumnMapper(hpo_id='HP:0001270',\n", - " hpo_label='Motor delay',\n", - " observed='+',\n", - " excluded='-')\n", - "delayedMotorMapper.preview_column(dft['Delayed motor development'])" + "persistentHL = HpTerm(hpo_id=\"HP:0032988\", label=\"Persistent head lag\")\n", + "headLag = Thresholder(hpo_term_high=persistentHL, threshold_high=4, unit=\"month\")\n", + "headLagMapper = ThresholdedColumnMapper(column_name=\"Age at head control (months)\",\n", + " thresholder=headLag)\n", + "column_mapper_list.append(headLagMapper)\n", + "headLagMapper.preview_column(dft)" + ] + }, + { + "cell_type": "markdown", + "id": "d5797592", + "metadata": {}, + "source": [ + "

Here is another example: \n", + "Delayed ability to roll over (HP:0032989). We will use the threshold of 6 months.

" ] }, { "cell_type": "code", - "execution_count": 9, - "id": "2e022f7e", + "execution_count": 10, + "id": "fa57c026", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mapping: None-6.0 monthcount
0Delayed ability to roll over (HP:0032989): not measured3
1Delayed ability to roll over (HP:0032989): observed2
\n", + "
" + ], + "text/plain": [ + " mapping: None-6.0 month count\n", + "0 Delayed ability to roll over (HP:0032989): not measured 3\n", + "1 Delayed ability to roll over (HP:0032989): observed 2" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "column_mapper_d['Delayed motor development'] = delayedMotorMapper" + "rollOver = HpTerm(hpo_id=\"HP:0032989\", label=\"Delayed ability to roll over\",)\n", + "rot = Thresholder(hpo_term_high=rollOver, threshold_high=6, unit=\"month\")\n", + "rollOverMappper = ThresholdedColumnMapper(column_name=\"Age at rolling (months)\", thresholder=rot)\n", + "column_mapper_list.append(rollOverMappper)\n", + "rollOverMappper.preview_column(dft)" ] }, { "cell_type": "markdown", - "id": "03adfdb5", + "id": "b9cc2d6b", "metadata": {}, "source": [ - "

ThresholdedColumnMapper

\n", - "

Use this mapper for phenotypic features that are reported as ages (numbers). For instance, \n", - "if \"Age at head control (months)\" is over 4 months, we would call \n", - "Persistent head lag HP:0032988.

\n", - "\n" + "

ThresholdedColumnMapper - special code

\n", + "

In some cases, phrases such as 'not attained' are used to denote that a child has not attained a certain milestone at the\n", + "time of last examination and that this constitutes an abnormal finding. In this case, the optional argument ''observed_code'' should be used.

" ] }, { "cell_type": "code", - "execution_count": 10, - "id": "82cfbe6c", + "execution_count": 11, + "id": "13c44b0d", "metadata": {}, "outputs": [ { @@ -1328,83 +758,224 @@ " \n", " \n", " \n", - " term\n", - " status\n", + " mapping: None-9.0 month\n", + " count\n", " \n", " \n", " \n", " \n", " 0\n", - " Persistent head lag (HP:0032988)\n", - " excluded\n", + " Delayed ability to sit (HP:0025336): not measured\n", + " 2\n", " \n", " \n", " 1\n", - " Persistent head lag (HP:0032988)\n", - " excluded\n", + " Delayed ability to sit (HP:0025336): observed\n", + " 3\n", + " \n", + " \n", + "\n", + "" + ], + "text/plain": [ + " mapping: None-9.0 month count\n", + "0 Delayed ability to sit (HP:0025336): not measured 2\n", + "1 Delayed ability to sit (HP:0025336): observed 3" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Age at unsupported sitting (months) \tthreshold: 9 months\n", + "delayedSit = HpTerm(hpo_id=\"HP:0025336\", label=\"Delayed ability to sit\")\n", + "sitThreshold = Thresholder(hpo_term_high=delayedSit, threshold_high=9, unit=\"month\")\n", + "delayedSittingMapper = ThresholdedColumnMapper(column_name=\"Age at unsupported sitting (months)\", thresholder=sitThreshold)\n", + "column_mapper_list.append(delayedSittingMapper)\n", + "delayedSittingMapper.preview_column(dft)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "a7443f90", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mapping: None-15.0 monthcount
2Persistent head lag (HP:0032988)excluded0Delayed ability to walk (HP:0031936): observed5
\n", + "
" + ], + "text/plain": [ + " mapping: None-15.0 month count\n", + "0 Delayed ability to walk (HP:0031936): observed 5" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Age at walking (months) - 15 months -- Delayed ability to walk HP:0031936\n", + "delayedWalk = HpTerm(hpo_id=\"HP:0031936\", label=\"Delayed ability to walk\")\n", + "walkTh = Thresholder(hpo_term_high=delayedWalk, threshold_high=15, unit=\"month\")\n", + "delayedWalkingMapper = ThresholdedColumnMapper( column_name=\"Age at walking (months)\", thresholder=walkTh)\n", + "column_mapper_list.append(delayedWalkingMapper)\n", + "delayedWalkingMapper.preview_column(dft)" + ] + }, + { + "cell_type": "markdown", + "id": "7fdaa77d-29b3-4ce4-8f05-9b8409cae82c", + "metadata": {}, + "source": [ + "

Other columns

\n", + "

The following \"simple\" columns are created in a loop for simplicity. The key represents the words used in the table, and the value is a two-element array with the corresponding HPO label and term id. See \n", + " row-based notebook\n", + " for more information.

" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "a3e15290-0011-4aea-acf5-1810eb6d7e1b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "We created 13 simple column mappers\n" + ] + } + ], + "source": [ + "items = {\n", + " 'History of regression': [\"Developmental regression\",\"HP:0002376\"],\n", + " 'Spastic diplegia':['Spastic diplegia', 'HP:0001264'], # \n", + " 'Autistic behavior': ['Autistic behavior', 'HP:0000729'], # \n", + " 'Infantile hypotonia':['Infantile muscular hypotonia','HP:0008947'], # \n", + " 'Cerebral atrophy':[\"Cerebral atrophy\",\"HP:0002059\"], #\n", + " 'Delayed myelination':[\"Delayed CNS myelination\",\"HP:0002188\"], #\n", + " 'Corpus callosum hypoplasia':['Hypoplasia of the corpus callosum','HP:0002079'],#\n", + " 'Prominent nasal bridge':['Prominent nasal bridge','HP:0000426'], #\n", + " 'Thin upper lip':[\"Thin upper lip vermilion\",\"HP:0000219\"],\n", + " \"Round face\":[\"Round face\",\"HP:0000311\"],\n", + " \"Short stature\":[\"Short stature\",\"HP:0004322\"],\n", + " \"Obesity\":[\"Obesity\", \"HP:0001513\"],\n", + " \"Precocious puberty\":[\"Precocious puberty\", \"HP:0000826\"],\n", + "}\n", + "item_column_mapper_d = hpo_cr.initialize_simple_column_maps(column_name_to_hpo_label_map=items, observed='+',\n", + " excluded='-')\n", + "print(f\"We created {len(item_column_mapper_d)} simple column mappers\")\n", + "# Transfer to column_mapper_d\n", + "for k, v in item_column_mapper_d.items():\n", + " column_mapper_list.append(v)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "90b39bc5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", "
mappingcount
3Persistent head lag (HP:0032988)observed0Intellectual disability, severe (HP:0010864) (observed)4
4Persistent head lag (HP:0032988)observed1Intellectual disability, profound (HP:0002187) (observed)1
\n", "
" ], "text/plain": [ - " term status\n", - "0 Persistent head lag (HP:0032988) excluded\n", - "1 Persistent head lag (HP:0032988) excluded\n", - "2 Persistent head lag (HP:0032988) excluded\n", - "3 Persistent head lag (HP:0032988) observed\n", - "4 Persistent head lag (HP:0032988) observed" + " mapping count\n", + "0 Intellectual disability, severe (HP:0010864) (observed) 4\n", + "1 Intellectual disability, profound (HP:0002187) (observed) 1" ] }, - "execution_count": 10, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "headLagMapper = ThresholdedColumnMapper(hpo_id=\"HP:0032988\", hpo_label=\"Persistent head lag\", \n", - " threshold=4, call_if_above=True)\n", - "headLagMapper.preview_column(dft[\"Age at head control (months)\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "1d23c283", - "metadata": {}, - "outputs": [], - "source": [ - "column_mapper_d[\"Age at head control (months)\"] = headLagMapper" - ] - }, - { - "cell_type": "markdown", - "id": "d5797592", - "metadata": {}, - "source": [ - "

Here is another example: \n", - "Delayed ability to roll over (HP:0032989). . We will use the threshold of 6 months.

" + "severity_d = {'Severe': 'Intellectual disability, severe',\n", + " 'Profound': 'Intellectual disability, profound'}\n", + "idMapper = OptionColumnMapper(column_name='Intellectual disability',\n", + " concept_recognizer=hpo_cr, option_d=severity_d)\n", + "column_mapper_list.append(idMapper)\n", + "idMapper.preview_column(dft)" ] }, { "cell_type": "code", - "execution_count": 12, - "id": "fa57c026", + "execution_count": 15, + "id": "89834a4d", "metadata": {}, "outputs": [ { @@ -1428,84 +999,50 @@ " \n", " \n", " \n", - " term\n", - " status\n", + " mapping\n", + " count\n", " \n", " \n", " \n", " \n", " 0\n", - " Delayed ability to roll over (HP:0032989)\n", - " not measured\n", + " Delayed speech and language development (HP:0000750) (observed)\n", + " 3\n", " \n", " \n", " 1\n", - " Delayed ability to roll over (HP:0032989)\n", - " observed\n", - " \n", - " \n", - " 2\n", - " Delayed ability to roll over (HP:0032989)\n", - " excluded\n", - " \n", - " \n", - " 3\n", - " Delayed ability to roll over (HP:0032989)\n", - " observed\n", - " \n", - " \n", - " 4\n", - " Delayed ability to roll over (HP:0032989)\n", - " excluded\n", + " Absent speech (HP:0001344) (observed)\n", + " 2\n", " \n", " \n", "\n", "" ], "text/plain": [ - " term status\n", - "0 Delayed ability to roll over (HP:0032989) not measured\n", - "1 Delayed ability to roll over (HP:0032989) observed\n", - "2 Delayed ability to roll over (HP:0032989) excluded\n", - "3 Delayed ability to roll over (HP:0032989) observed\n", - "4 Delayed ability to roll over (HP:0032989) excluded" + " mapping count\n", + "0 Delayed speech and language development (HP:0000750) (observed) 3\n", + "1 Absent speech (HP:0001344) (observed) 2" ] }, - "execution_count": 12, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "rollOverMappper = ThresholdedColumnMapper(hpo_id=\"HP:0032989\", hpo_label=\"Delayed ability to roll over\", \n", - " threshold=6, call_if_above=True)\n", - 
"rollOverMappper.preview_column(dft[\"Age at rolling (months)\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "dc32854a", - "metadata": {}, - "outputs": [], - "source": [ - "column_mapper_d[\"Age at rolling (months)\"] = rollOverMappper" - ] - }, - { - "cell_type": "markdown", - "id": "b9cc2d6b", - "metadata": {}, - "source": [ - "

ThresholdedColumnMapper - special code

\n", - "

In some cases, phrases such as 'not attained' are used to denote that a child has not attained a certain milestone at the\n", - "time of last examination and this this constitutes an abnormal finding. In this case, the optional argument ''observed_code'' should be used.

" + "# Language skills\n", + "language_d = {'Simple two-word sentences': 'Delayed speech and language development',\n", + " 'Simple words': 'Delayed speech and language development',\n", + " 'Nonverbal': 'Absent speech'}\n", + "languageMapper = OptionColumnMapper(column_name='Language skills',concept_recognizer=hpo_cr, option_d=language_d)\n", + "column_mapper_list.append(languageMapper)\n", + "languageMapper.preview_column(dft)" ] }, { "cell_type": "code", - "execution_count": 14, - "id": "13c44b0d", + "execution_count": 16, + "id": "eb0933ce", "metadata": {}, "outputs": [ { @@ -1529,65 +1066,51 @@ " \n", " \n", " \n", - " term\n", - " status\n", + " mapping\n", + " count\n", " \n", " \n", " \n", " \n", " 0\n", - " Delayed ability to sit (HP:0025336)\n", - " excluded\n", + " Loss of ambulation (HP:0002505) (observed)\n", + " 3\n", " \n", " \n", " 1\n", - " Delayed ability to sit (HP:0025336)\n", - " excluded\n", - " \n", - " \n", - " 2\n", - " Delayed ability to sit (HP:0025336)\n", - " observed\n", - " \n", - " \n", - " 3\n", - " Delayed ability to sit (HP:0025336)\n", - " observed\n", - " \n", - " \n", - " 4\n", - " Delayed ability to sit (HP:0025336)\n", - " observed\n", + " Delayed gross motor development (HP:0002194) (observed)\n", + " 1\n", " \n", " \n", "\n", "" ], "text/plain": [ - " term status\n", - "0 Delayed ability to sit (HP:0025336) excluded\n", - "1 Delayed ability to sit (HP:0025336) excluded\n", - "2 Delayed ability to sit (HP:0025336) observed\n", - "3 Delayed ability to sit (HP:0025336) observed\n", - "4 Delayed ability to sit (HP:0025336) observed" + " mapping count\n", + "0 Loss of ambulation (HP:0002505) (observed) 3\n", + "1 Delayed gross motor development (HP:0002194) (observed) 1" ] }, - "execution_count": 14, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# Age at unsupported sitting (months) \tthreshold: 9 months\n", - "delayedSittingMapper = ThresholdedColumnMapper(hpo_id=\"HP:0025336\", 
hpo_label=\"Delayed ability to sit\", \n", - " threshold=9, call_if_above=True, observed_code='Not acquired')\n", - "delayedSittingMapper.preview_column(dft[\"Age at unsupported sitting (months)\"])" + "# Gross motor skills Wheelchair bound \tWheelchair bound \tWheelchair bound \tCruising (5y)\tWalking (5y)\n", + "gms_d = {\n", + " \"Wheelchair bound\": \"Loss of ambulation\",\n", + " \"Cruising\": \"Delayed gross motor development\"\n", + "}\n", + "gmsMapper = OptionColumnMapper(column_name='Gross motor skills',concept_recognizer=hpo_cr, option_d=gms_d)\n", + "column_mapper_list.append(gmsMapper)\n", + "gmsMapper.preview_column(dft)\n" ] }, { "cell_type": "code", - "execution_count": 15, - "id": "a7443f90", + "execution_count": 17, + "id": "65c27c04", "metadata": {}, "outputs": [ { @@ -1611,180 +1134,56 @@ " \n", " \n", " \n", - " term\n", - " status\n", + " mapping\n", + " count\n", " \n", " \n", " \n", " \n", " 0\n", - " Delayed ability to walk (HP:0031936)\n", - " observed\n", + " Thick eyebrow (HP:0000574) (observed)\n", + " 1\n", " \n", " \n", " 1\n", - " Delayed ability to walk (HP:0031936)\n", - " observed\n", + " Upslanted palpebral fissure (HP:0000582) (observed)\n", + " 1\n", " \n", " \n", " 2\n", - " Delayed ability to walk (HP:0031936)\n", - " observed\n", + " Anteverted nares (HP:0000463) (observed)\n", + " 1\n", " \n", " \n", " 3\n", - " Delayed ability to walk (HP:0031936)\n", - " observed\n", - " \n", - " \n", - " 4\n", - " Delayed ability to walk (HP:0031936)\n", - " observed\n", + " Short philtrum (HP:0000322) (observed)\n", + " 1\n", " \n", " \n", "\n", "" ], "text/plain": [ - " term status\n", - "0 Delayed ability to walk (HP:0031936) observed\n", - "1 Delayed ability to walk (HP:0031936) observed\n", - "2 Delayed ability to walk (HP:0031936) observed\n", - "3 Delayed ability to walk (HP:0031936) observed\n", - "4 Delayed ability to walk (HP:0031936) observed" + " mapping count\n", + "0 Thick eyebrow (HP:0000574) (observed) 1\n", + "1 Upslanted 
palpebral fissure (HP:0000582) (observed) 1\n", + "2 Anteverted nares (HP:0000463) (observed) 1\n", + "3 Short philtrum (HP:0000322) (observed) 1" ] }, - "execution_count": 15, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# Age at walking (months) - 15 months -- Delayed ability to walk HP:0031936\n", - "delayedWalkingMapper = ThresholdedColumnMapper(hpo_id=\"HP:0031936\", hpo_label=\"Delayed ability to walk\", \n", - " threshold=15, call_if_above=True, observed_code='Not acquired')\n", - "delayedWalkingMapper.preview_column(dft[\"Age at walking (months)\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "9db4157a", - "metadata": {}, - "outputs": [], - "source": [ - "column_mapper_d[\"Age at unsupported sitting (months)\"] = delayedSittingMapper\n", - "column_mapper_d[\"Age at walking (months)\"] = delayedWalkingMapper" - ] - }, - { - "cell_type": "markdown", - "id": "7fdaa77d-29b3-4ce4-8f05-9b8409cae82c", - "metadata": {}, - "source": [ - "

Other columns

\n", - "

The following \"simple\" columns are created in a loop for simplicity. The key represents the words used in the table, and the value is a two-element array with the corresponding HPO label and term id. See \n", - " row-based notebook\n", - " for more information.

" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "a3e15290-0011-4aea-acf5-1810eb6d7e1b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "We created 13 simple column mappers\n" - ] - } - ], - "source": [ - "items = {\n", - " 'History of regression': [\"Developmental regression\",\"HP:0002376\"],\n", - " 'Spastic diplegia':['Spastic diplegia', 'HP:0001264'], # \n", - " 'Autistic behavior': ['Autistic behavior', 'HP:0000729'], # \n", - " 'Infantile hypotonia':['Infantile muscular hypotonia','HP:0008947'], # \n", - " 'Cerebral atrophy':[\"Cerebral atrophy\",\"HP:0002059\"], #\n", - " 'Delayed myelination':[\"Delayed CNS myelination\",\"HP:0002188\"], #\n", - " 'Corpus callosum hypoplasia':['Hypoplasia of the corpus callosum','HP:0002079'],#\n", - " 'Prominent nasal bridge':['Prominent nasal bridge','HP:0000426'], #\n", - " 'Thin upper lip':[\"Thin upper lip vermilion\",\"HP:0000219\"],\n", - " \"Round face\":[\"Round face\",\"HP:0000311\"],\n", - " \"Short stature\":[\"Short stature\",\"HP:0004322\"],\n", - " \"Obesity\":[\"Obesity\", \"HP:0001513\"],\n", - " \"Precocious puberty\":[\"Precocious puberty\", \"HP:0000826\"],\n", - "}\n", - "item_column_mapper_d = hpo_cr.initialize_simple_column_maps(column_name_to_hpo_label_map=items, observed='+',\n", - " excluded='-')\n", - "print(f\"We created {len(item_column_mapper_d)} simple column mappers\")\n", - "# Transfer to column_mapper_d\n", - "for k, v in item_column_mapper_d.items():\n", - " column_mapper_d[k] = v" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "90b39bc5", - "metadata": {}, - "outputs": [], - "source": [ - "severity_d = {'Severe': 'Intellectual disability, severe',\n", - " 'Profound': 'Intellectual disability, profound'}\n", - "idMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=severity_d)\n", - "#idMapper.preview_column(dft['Intellectual disability'])\n", - "column_mapper_d['Intellectual 
disability'] = idMapper" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "89834a4d", - "metadata": {}, - "outputs": [], - "source": [ - "# Language skills\n", - "language_d = {'Simple two-word sentences': 'Delayed speech and language development',\n", - " 'Simple words': 'Delayed speech and language development',\n", - " 'Nonverbal': 'Absent speech'}\n", - "languageMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=language_d)\n", - "# languageMapper.preview_column(dft['Language skills'])\n", - "column_mapper_d['Language skills'] = languageMapper" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "eb0933ce", - "metadata": {}, - "outputs": [], - "source": [ - "# Gross motor skills Wheelchair bound \tWheelchair bound \tWheelchair bound \tCruising (5y)\tWalking (5y)\n", - "gms_d = {\n", - " \"Wheelchair bound\": \"Loss of ambulation\",\n", - " \"Cruising\": \"Delayed gross motor development\"\n", + "other_d = {\n", + " \"Long and thick eyebrows, \": [\"Thick eyebrows\", \"Long eyebrows\"],\n", + " \"upper slanted palpebral fissures\": \"Upslanted palpebral fissure\",\n", "}\n", - "gmsMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=gms_d)\n", - "# gmsMapper.preview_column(dft['Gross motor skills'])\n", - "column_mapper_d['Gross motor skills'] = gmsMapper" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "65c27c04", - "metadata": {}, - "outputs": [], - "source": [ - "# Others\n", - "other_d = {'upper slanted palpebral fissures': 'Upslanted palpebral fissure'}\n", - "otherMapper = CustomColumnMapper(concept_recognizer=hpo_cr, custom_map_d=other_d)\n", - "#otherMapper.preview_column(dft['Others'])\n", - "column_mapper_d['Others'] = otherMapper" + "otherMapper = OptionColumnMapper(column_name='Others',concept_recognizer=hpo_cr, option_d=other_d)\n", + "column_mapper_list.append(otherMapper)\n", + "otherMapper.preview_column(dft)" ] }, { @@ -1798,39 +1197,24 @@ }, { "cell_type": "code", 
- "execution_count": 22, - "id": "96c63cef-cb8c-4cb0-bc60-5060ff48eb2d", + "execution_count": 18, + "id": "21517a08", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_015133.4%3Ac.1732C>T/NM_015133.4?content-type=application%2Fjson\n", - "https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_015133.4%3Ac.3436C>T/NM_015133.4?content-type=application%2Fjson\n" - ] - } - ], + "outputs": [], "source": [ - "hg38 = 'hg38'\n", "MAPK8IP3_transcript='NM_015133.4'\n", - "var_list = dft['Variant (hg19, NM_015133.4)'].unique()\n", - "vvalidator = VariantValidator(genome_build=hg38, transcript=MAPK8IP3_transcript)\n", - "var_d = {}\n", - "for v in var_list:\n", - " var_d[v] = vvalidator.encode_hgvs(v)" + "vman = VariantManager(df=dft, individual_column_name=\"individual_id\",transcript=MAPK8IP3_transcript,gene_symbol=\"MAPK8IP3\",\n", + " gene_id=\"HGNC:6884\", allele_1_column_name=\"Variant (hg19, NM_015133.4)\")" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 19, "id": "da4d5706", "metadata": {}, "outputs": [], "source": [ - "varMapper = VariantColumnMapper(variant_d=var_d,\n", - " variant_column_name='Variant (hg19, NM_015133.4)', \n", - " default_genotype='heterozygous')\n", + "varMapper = VariantColumnMapper(variant_column_name=\"Variant (hg19, NM_015133.4)\", variant_d=vman.get_variant_d(),default_genotype=\"heterozygous\")\n", "#varMapper.preview_column(dft['Variant (hg19, NM_015133.4)'])" ] }, @@ -1845,7 +1229,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 20, "id": "3e64dc08", "metadata": {}, "outputs": [ @@ -1877,49 +1261,49 @@ " \n", " \n", " 0\n", - " 29\n", " P29Y\n", + " 29\n", " \n", " \n", " 1\n", - " 27\n", " P27Y\n", + " 27\n", " \n", " \n", " 2\n", - " 16\n", " P16Y\n", + " 16\n", " \n", " \n", " 3\n", - " 5\n", " P5Y\n", + " 5\n", " \n", " \n", "\n", "" ], 
"text/plain": [ - " original column contents age\n", - "0 29 P29Y\n", - "1 27 P27Y\n", - "2 16 P16Y\n", - "3 5 P5Y" + " original column contents age\n", + "0 P29Y 29\n", + "1 P27Y 27\n", + "2 P16Y 16\n", + "3 P5Y 5" ] }, - "execution_count": 24, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ageMapper = AgeColumnMapper.by_year('Age (yr)')\n", - "ageMapper.preview_column(dft['Age (yr)'])" + "ageMapper.preview_column(dft)" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 21, "id": "71f664cc", "metadata": {}, "outputs": [ @@ -1987,38 +1371,41 @@ "4 Female FEMALE" ] }, - "execution_count": 25, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sexMapper = SexColumnMapper(male_symbol='Male', female_symbol='Female', column_name='Sex')\n", - "sexMapper.preview_column(dft['Sex'])" + "sexMapper.preview_column(dft)" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 22, "id": "f6581a8a", "metadata": {}, "outputs": [], "source": [ - "encoder = CohortEncoder(df=dft, hpo_cr=hpo_cr, column_mapper_d=column_mapper_d, \n", - " individual_column_name=\"patient_id\", \n", - " agemapper=ageMapper, \n", + "encoder = CohortEncoder(df=dft, \n", + " hpo_cr=hpo_cr, \n", + " column_mapper_list=column_mapper_list, \n", + " individual_column_name=\"individual_id\", \n", + " age_at_last_encounter_mapper=ageMapper, \n", " sexmapper=sexMapper,\n", - " metadata=metadata,\n", " variant_mapper=varMapper,\n", - " pmid=PMID)\n", + " metadata=metadata,\n", + " )\n", "disease_id = \"OMIM:618443\"\n", "disease_label = \"Neurodevelopmental disorder with or without variable brain abnormalities\"\n", - "encoder.set_disease(disease_id=disease_id, label=disease_label)" + "NEDBA = Disease(disease_id=disease_id, disease_label=disease_label)\n", + "encoder.set_disease(disease=NEDBA)" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 23, "id": "fd367ed6", 
"metadata": {}, "outputs": [], @@ -2028,7 +1415,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 24, "id": "5d044b78", "metadata": {}, "outputs": [ @@ -2050,6 +1437,12 @@ " \"phenotypicFeatures\": [\n", " {\n", " \"type\": {\n", + " \"id\": \"HP:0001270\",\n", + " \"label\": \"Motor delay\"\n", + " }\n", + " },\n", + " {\n", + " \"type\": {\n", " \"id\": \"HP:0031936\",\n", " \"label\": \"Delayed ability to walk\"\n", " }\n", @@ -2119,20 +1512,6 @@ " \"id\": \"HP:0002505\",\n", " \"label\": \"Loss of ambulation\"\n", " }\n", - " },\n", - " {\n", - " \"type\": {\n", - " \"id\": \"HP:0032988\",\n", - " \"label\": \"Persistent head lag\"\n", - " },\n", - " \"excluded\": true\n", - " },\n", - " {\n", - " \"type\": {\n", - " \"id\": \"HP:0025336\",\n", - " \"label\": \"Delayed ability to sit\"\n", - " },\n", - " \"excluded\": true\n", " }\n", " ],\n", " \"interpretations\": [\n", @@ -2150,7 +1529,7 @@ " \"interpretationStatus\": \"CAUSATIVE\",\n", " \"variantInterpretation\": {\n", " \"variationDescriptor\": {\n", - " \"id\": \"var_EQquVDqFAjuRzfFzBqZbpeLcW\",\n", + " \"id\": \"var_iLuzWxvqVAnwkNmpilMQKXjQP\",\n", " \"geneContext\": {\n", " \"valueId\": \"HGNC:6884\",\n", " \"symbol\": \"MAPK8IP3\"\n", @@ -2184,8 +1563,16 @@ " }\n", " }\n", " ],\n", + " \"diseases\": [\n", + " {\n", + " \"term\": {\n", + " \"id\": \"OMIM:618443\",\n", + " \"label\": \"Neurodevelopmental disorder with or without variable brain abnormalities\"\n", + " }\n", + " }\n", + " ],\n", " \"metaData\": {\n", - " \"created\": \"2023-10-14T18:14:15.413787841Z\",\n", + " \"created\": \"2024-05-28T06:24:22.878716945Z\",\n", " \"createdBy\": \"ORCID:0000-0002-0736-9199\",\n", " \"resources\": [\n", " {\n", @@ -2224,7 +1611,7 @@ " \"id\": \"hp\",\n", " \"name\": \"human phenotype ontology\",\n", " \"url\": \"http://purl.obolibrary.org/obo/hp.owl\",\n", - " \"version\": \"2023-10-09\",\n", + " \"version\": \"2024-04-26\",\n", " \"namespacePrefix\": \"HP\",\n", " \"iriPrefix\": 
\"http://purl.obolibrary.org/obo/HP_\"\n", " }\n", @@ -2261,22 +1648,37 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 25, "id": "8e73ea60-3db9-4e9a-9e2b-421fbdb9b0c7", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "We found 0 validation errors\n" - ] + "data": { + "text/html": [ + "

Cohort validation

\n", + "

Errors found with 5 of 5 phenopackets.

\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Error counts
LevelError categoryCount
WARNINGREDUNDANT6
INFORMATIONNOT_MEASURED37
\n", + "

A total of 43 issues were fixed and no individual was removed from the cohort.

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "cvalidator = ContentValidator(min_var=1, min_hpo=1)\n", - "errors = cvalidator.validate_phenopacket_list([i.to_ga4gh_phenopacket(metadata.to_ga4gh()) for i in individuals])\n", - "print(f\"We found {len(errors)} validation errors\")" + "\n", + "cvalidator = CohortValidator(cohort=individuals, ontology=hp_ontology, min_hpo=1,\n", + " allelic_requirement=AllelicRequirement.MONO_ALLELIC)\n", + "qc = QcVisualizer(cohort_validator=cvalidator)\n", + "display(HTML(qc.to_summary_html()))" ] }, { @@ -2290,55 +1692,22 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 26, "id": "71427f90-9bc6-4441-9aff-95c181dbcec2", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "\n", - "\n", + "
\n", "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
5 phenopackets - PMID:30945334 (n=5)
IndividualDiseaseGenotypePhenotypic features
Individual 1 (MALE; P29Y)\n", - "Neurodevelopmental disorder with or without variable brain abnormalities (OMIM:618443)\n", - "NM_015133.4:c.1732C>T (heterozygous)Delayed ability to walk (HP:0031936); Spastic diplegia (HP:0001264); Cerebral atrophy (HP:0002059); Delayed CNS myelination (HP:0002188); Thin upper lip vermilion (HP:0000219); Round face (HP:0000311); Short stature (HP:0004322); Obesity (HP:0001513); Precocious puberty (HP:0000826); Intellectual disability, severe (HP:0010864); Delayed speech and language development (HP:0000750); Loss of ambulation (HP:0002505)
Individual 2 (FEMALE; P27Y)\n", - "Neurodevelopmental disorder with or without variable brain abnormalities (OMIM:618443)\n", - "NM_015133.4:c.1732C>T (heterozygous)Delayed ability to roll over (HP:0032989); Delayed ability to walk (HP:0031936); Spastic diplegia (HP:0001264); Cerebral atrophy (HP:0002059); Delayed CNS myelination (HP:0002188); Thin upper lip vermilion (HP:0000219); Round face (HP:0000311); Short stature (HP:0004322); Obesity (HP:0001513); Precocious puberty (HP:0000826); Intellectual disability, severe (HP:0010864); Delayed speech and language development (HP:0000750); Loss of ambulation (HP:0002505)
Individual 3 (MALE; P16Y)\n", - "Neurodevelopmental disorder with or without variable brain abnormalities (OMIM:618443)\n", - "NM_015133.4:c.1732C>T (heterozygous)Delayed ability to sit (HP:0025336); Delayed ability to walk (HP:0031936); Spastic diplegia (HP:0001264); Prominent nasal bridge (HP:0000426); Thin upper lip vermilion (HP:0000219); Short stature (HP:0004322); Obesity (HP:0001513); Intellectual disability, profound (HP:0002187); Delayed speech and language development (HP:0000750); Loss of ambulation (HP:0002505)
Individual 4 (MALE; P5Y)\n", - "Neurodevelopmental disorder with or without variable brain abnormalities (OMIM:618443)\n", - "NM_015133.4:c.3436C>T (heterozygous)Persistent head lag (HP:0032988); Delayed ability to roll over (HP:0032989); Delayed ability to sit (HP:0025336); Delayed ability to walk (HP:0031936); Autistic behavior (HP:0000729); Infantile muscular hypotonia (HP:0008947); Cerebral atrophy (HP:0002059); Prominent nasal bridge (HP:0000426); Thin upper lip vermilion (HP:0000219); Round face (HP:0000311); Short stature (HP:0004322); Intellectual disability, severe (HP:0010864); Absent speech (HP:0001344)
Individual 5 (FEMALE; P5Y)\n", - "Neurodevelopmental disorder with or without variable brain abnormalities (OMIM:618443)\n", - "NM_015133.4:c.3436C>T (heterozygous)Persistent head lag (HP:0032988); Delayed ability to sit (HP:0025336); Delayed ability to walk (HP:0031936); Spastic diplegia (HP:0001264); Autistic behavior (HP:0000729); Infantile muscular hypotonia (HP:0008947); Cerebral atrophy (HP:0002059); Delayed CNS myelination (HP:0002188); Prominent nasal bridge (HP:0000426); Thin upper lip vermilion (HP:0000219); Round face (HP:0000311); Intellectual disability, severe (HP:0010864); Absent speech (HP:0001344); Thick eyebrow (HP:0000574); Upslanted palpebral fissure (HP:0000582); Anteverted nares (HP:0000463); Short philtrum (HP:0000322)
\n" + "IndividualDiseaseGenotypePhenotypic features\n", + "Individual 1 (MALE; P29Y)Neurodevelopmental disorder with or without variable brain abnormalities (OMIM:618443)NM_015133.4:c.1732C>T (heterozygous)Delayed ability to walk (HP:0031936); Spastic diplegia (HP:0001264); Cerebral atrophy (HP:0002059); Delayed CNS myelination (HP:0002188); Thin upper lip vermilion (HP:0000219); Round face (HP:0000311); Short stature (HP:0004322); Obesity (HP:0001513); Precocious puberty (HP:0000826); Intellectual disability, severe (HP:0010864); Delayed speech and language development (HP:0000750); Loss of ambulation (HP:0002505)\n", + "Individual 2 (FEMALE; P27Y)Neurodevelopmental disorder with or without variable brain abnormalities (OMIM:618443)NM_015133.4:c.1732C>T (heterozygous)Delayed ability to roll over (HP:0032989); Delayed ability to walk (HP:0031936); Spastic diplegia (HP:0001264); Cerebral atrophy (HP:0002059); Delayed CNS myelination (HP:0002188); Thin upper lip vermilion (HP:0000219); Round face (HP:0000311); Short stature (HP:0004322); Obesity (HP:0001513); Precocious puberty (HP:0000826); Intellectual disability, severe (HP:0010864); Delayed speech and language development (HP:0000750); Loss of ambulation (HP:0002505)\n", + "Individual 3 (MALE; P16Y)Neurodevelopmental disorder with or without variable brain abnormalities (OMIM:618443)NM_015133.4:c.1732C>T (heterozygous)Delayed ability to sit (HP:0025336); Delayed ability to walk (HP:0031936); Spastic diplegia (HP:0001264); Prominent nasal bridge (HP:0000426); Thin upper lip vermilion (HP:0000219); Short stature (HP:0004322); Obesity (HP:0001513); Intellectual disability, profound (HP:0002187); Delayed speech and language development (HP:0000750); Loss of ambulation (HP:0002505)\n", + "Individual 4 (MALE; P5Y)Neurodevelopmental disorder with or without variable brain abnormalities (OMIM:618443)NM_015133.4:c.3436C>T (heterozygous)Persistent head lag (HP:0032988); Delayed ability to roll over (HP:0032989); Delayed 
ability to sit (HP:0025336); Delayed ability to walk (HP:0031936); Autistic behavior (HP:0000729); Infantile muscular hypotonia (HP:0008947); Cerebral atrophy (HP:0002059); Prominent nasal bridge (HP:0000426); Thin upper lip vermilion (HP:0000219); Round face (HP:0000311); Short stature (HP:0004322); Intellectual disability, severe (HP:0010864); Absent speech (HP:0001344)\n", + "Individual 5 (FEMALE; P5Y)Neurodevelopmental disorder with or without variable brain abnormalities (OMIM:618443)NM_015133.4:c.3436C>T (heterozygous)Persistent head lag (HP:0032988); Delayed ability to sit (HP:0025336); Delayed ability to walk (HP:0031936); Spastic diplegia (HP:0001264); Autistic behavior (HP:0000729); Infantile muscular hypotonia (HP:0008947); Cerebral atrophy (HP:0002059); Delayed CNS myelination (HP:0002188); Prominent nasal bridge (HP:0000426); Thin upper lip vermilion (HP:0000219); Round face (HP:0000311); Intellectual disability, severe (HP:0010864); Absent speech (HP:0001344); Thick eyebrow (HP:0000574); Upslanted palpebral fissure (HP:0000582); Anteverted nares (HP:0000463); Short philtrum (HP:0000322)\n", + "" ], "text/plain": [ "" @@ -2349,9 +1718,8 @@ } ], "source": [ - "from IPython.display import HTML, display\n", - "phenopackets = [i.to_ga4gh_phenopacket(metadata=metadata.to_ga4gh()) for i in individuals]\n", - "table = PhenopacketTable(phenopacket_list=phenopackets)\n", + "individuals = cvalidator.get_error_free_individual_list()\n", + "table = IndividualTable(individuals)\n", "display(HTML(table.to_html()))" ] }, @@ -2366,7 +1734,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 27, "id": "31305025", "metadata": {}, "outputs": [ @@ -2380,8 +1748,7 @@ ], "source": [ "Individual.output_individuals_as_phenopackets(individual_list=individuals, \n", - " pmid=PMID, \n", - " metadata=metadata.to_ga4gh(), \n", + " metadata=metadata, \n", " outdir=\"phenopackets\")" ] }, @@ -2398,7 +1765,7 @@ "kernelspec": { "display_name": "ppt_venv", 
"language": "python", - "name": "ppt_venv" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -2410,7 +1777,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.3" + "version": "3.9.6" } }, "nbformat": 4, diff --git a/notebooks/data/PMID_30945334.xlsx b/notebooks/data/PMID_30945334.xlsx index 63c8e59e..65057e62 100644 Binary files a/notebooks/data/PMID_30945334.xlsx and b/notebooks/data/PMID_30945334.xlsx differ diff --git a/notebooks/data/PMID_31278393.xlsx b/notebooks/data/PMID_31278393.xlsx index d1a90c95..3277a6a6 100644 Binary files a/notebooks/data/PMID_31278393.xlsx and b/notebooks/data/PMID_31278393.xlsx differ diff --git a/src/pyphetools/__init__.py b/src/pyphetools/__init__.py index 393273a5..df42be87 100644 --- a/src/pyphetools/__init__.py +++ b/src/pyphetools/__init__.py @@ -5,7 +5,7 @@ from . import validation -__version__ = "0.9.85" +__version__ = "0.9.94" __all__ = [ diff --git a/src/pyphetools/creation/__init__.py b/src/pyphetools/creation/__init__.py index b0cfc8fe..615bca6b 100644 --- a/src/pyphetools/creation/__init__.py +++ b/src/pyphetools/creation/__init__.py @@ -21,6 +21,7 @@ from .import_template import TemplateImporter from .individual import Individual from .metadata import MetaData +from .mode_of_inheritance import Moi from .option_column_mapper import OptionColumnMapper from .pyphetools_age import PyPheToolsAge, IsoAge, HpoAge, GestationalAge, HPO_ONSET_TERMS from .sex_column_mapper import SexColumnMapper diff --git a/src/pyphetools/creation/case_template_encoder.py b/src/pyphetools/creation/case_template_encoder.py index 91634cad..49200fc4 100644 --- a/src/pyphetools/creation/case_template_encoder.py +++ b/src/pyphetools/creation/case_template_encoder.py @@ -375,8 +375,14 @@ def _parse_individual(self, row:pd.Series): raise ValueError(f"Insufficient data items: \"{data_items}\"") # If we get here, we can contruct an individual individual_id = 
data_items.get('individual_id') + if individual_id is None or isinstance(individual_id, float) or len(individual_id) == 0: + raise ValueError(f"Empty individual_id field for {row}") pmid = data_items.get("PMID") title = data_items.get("title") + if pmid is None or isinstance(pmid, float) or not pmid.startswith("PMID"): + raise ValueError(f"Could not find PubMed identifier for {individual_id}") + if title is None or isinstance(title, float) or len(title) < 5: + raise ValueError(f"Could not find valid title for {individual_id}") citation = Citation(pmid=pmid, title=title) sex = data_items.get("sex") if sex == "M": diff --git a/src/pyphetools/creation/citation.py b/src/pyphetools/creation/citation.py index fce44fe2..4c9c2e8f 100644 --- a/src/pyphetools/creation/citation.py +++ b/src/pyphetools/creation/citation.py @@ -10,7 +10,11 @@ class Citation: :type title: str """ - def __init__(self, pmid, title) -> None: + def __init__(self, pmid:str, title:str) -> None: + if pmid is None or isinstance(pmid, float) or not pmid.startswith("PMID"): + raise ValueError(f"Could not find PubMed identifier") + if title is None or isinstance(title, float) or len(title) < 5: + raise ValueError(f"Could not find valid title") self._pmid = pmid self._title = title diff --git a/src/pyphetools/creation/create_template.py b/src/pyphetools/creation/create_template.py index 98d8c465..39ff95db 100644 --- a/src/pyphetools/creation/create_template.py +++ b/src/pyphetools/creation/create_template.py @@ -51,7 +51,7 @@ def arrange_terms(self) -> List[hpotk.model.TermId]: hp_term_list = list() ## Arrange hp_terms so that all terms that belong to a given top level term go together PHENO_ROOT_TERM_ID = "HP:0000118" - top_level_term_ids = self._hpo_ontology.graph.get_children(PHENO_ROOT_TERM_ID, False) + top_level_term_ids = self._hpo_ontology.graph.get_children(PHENO_ROOT_TERM_ID, True) top_level_term_ids = list(top_level_term_ids) top_level_d = defaultdict(list) for hpt in self._all_added_hp_term_set: 
diff --git a/src/pyphetools/creation/import_template.py b/src/pyphetools/creation/import_template.py index f9dbf185..77caade8 100644 --- a/src/pyphetools/creation/import_template.py +++ b/src/pyphetools/creation/import_template.py @@ -3,6 +3,7 @@ from collections import defaultdict import typing import phenopackets as PPKt +from .mode_of_inheritance import Moi class TemplateImporter: @@ -245,15 +246,15 @@ def filter_diseases(disease_id, ppkt_list): def create_hpoa_from_phenopackets(self, pmid:str, - moi:str, + mode_of_inheritance:Moi, ppkt_dir:str="phenopackets", target:str=None) -> pd.DataFrame: """Create an HPO annotation (HPOA) file from the current cohort :param pmid: PubMed id for the mode of inheritance :type pmid: str - :param moi: Mode of inheritance (Autosomal dominant, Autosomal recessive, etc) - :type moi: str + :param mode_of_inheritance: Mode of inheritance (enumeration) + :type mode_of_inheritance: Moi :param ppkt_dir: Directory with phenopackets Defaults to "phenopackets". :param target: Disease id (e.g., OMIM:600123) to select only phenopackets with this disease. Defaults to None. 
:type target: str @@ -271,18 +272,7 @@ def create_hpoa_from_phenopackets(self, ppkt_list = TemplateImporter.filter_diseases(target, ppkt_list) TemplateImporter.check_disease_entries(ppkt_list) builder = HpoaTableBuilder(phenopacket_list=ppkt_list, created_by=self._created_by) - if moi == "Autosomal dominant": - builder.autosomal_dominant(pmid) - elif moi == "Autosomal recessive": - builder.autosomal_recessive(pmid) - elif moi == "X-linked inheritance": - builder.x_linked(pmid) - elif moi == "X-linked recessive inheritance": - builder.x_linked_recessive(pmid) - elif moi == "X-linked dominant inheritance": - builder.x_linked_dominant() - else: - raise ValueError(f"Did not recognize mode of inheritance {moi}") + builder.add_moi(mode_of_inheritance=mode_of_inheritance, pmid=pmid) hpoa_creator = builder.build() hpoa_creator.write_data_frame() return hpoa_creator.get_dataframe() diff --git a/src/pyphetools/creation/individual.py b/src/pyphetools/creation/individual.py index 28071a56..fba9e7fe 100644 --- a/src/pyphetools/creation/individual.py +++ b/src/pyphetools/creation/individual.py @@ -376,6 +376,8 @@ def output_individuals_as_phenopackets(individual_list, metadata:MetaData, outdi if not os.path.isdir(outdir): os.makedirs(outdir) written = 0 + if not isinstance(metadata, MetaData): + raise ValueError(f"metadata argument must be pyphetools MetaData object (not GA4GH metadata message), but was {type(metadata)}") pmid = metadata.get_pmid() for individual in individual_list: phenopckt = individual.to_ga4gh_phenopacket(metadata=metadata) diff --git a/src/pyphetools/creation/mode_of_inheritance.py b/src/pyphetools/creation/mode_of_inheritance.py new file mode 100644 index 00000000..da03a5ee --- /dev/null +++ b/src/pyphetools/creation/mode_of_inheritance.py @@ -0,0 +1,31 @@ +from enum import Enum +from .hp_term import HpTerm + +class Moi(Enum): + AD="Autosomal dominant inheritance" + AR="Autosomal recessive inheritance" + XLI="X-linked inheritance" + XLR="X-linked recessive 
inheritance" + XLD="X-linked dominant inheritance" + MITO="Mitochondrial inheritance" + YLI="Y-linked inheritance" + + + def to_HPO(self): + if self == Moi.AD: + return HpTerm(hpo_id="HP:0000006", label="Autosomal dominant inheritance") + elif self == Moi.AR: + return HpTerm(hpo_id="HP:0000007", label="Autosomal recessive inheritance") + elif self == Moi.XLI: + return HpTerm(hpo_id="HP:0001417", label="X-linked inheritance") + elif self == Moi.XLR: + return HpTerm(hpo_id="HP:0001419", label="X-linked recessive inheritance") + elif self == Moi.XLD: + return HpTerm(hpo_id="HP:0001423", label="X-linked dominant inheritance") + elif self == Moi.MITO: + return HpTerm(hpo_id="HP:0001427", label="Mitochondrial inheritance") + elif self == Moi.YLI: + return HpTerm(hpo_id="HP:0001450", label="Y-linked inheritance") + + else: + raise ValueError(f"Unrecognized Moi enum (should never happen)") diff --git a/src/pyphetools/creation/pyphetools_age.py b/src/pyphetools/creation/pyphetools_age.py index 9036e33b..7beb739e 100644 --- a/src/pyphetools/creation/pyphetools_age.py +++ b/src/pyphetools/creation/pyphetools_age.py @@ -17,6 +17,12 @@ "Middle age onset": "HP:0003596", # Onset of symptoms after the age of 16 years. "Young adult onset":"HP:0011462", + # Onset of disease at an age of greater than or equal to 25 to under 40 years. + "Late young adult onset": "HP:0025710", + # Onset of disease at an age of greater than or equal to 19 to under 25 years. + "Intermediate young adult onset": "HP:0025709", + # Onset of disease at an age of greater than or equal to 16 to under 19 years. + "Early young adult onset": "HP:0025708", # Onset of disease after 16 years . "Adult onset": "HP:0003581", #Onset of signs or symptoms of disease between the age of 5 and 15 years. 
diff --git a/src/pyphetools/visualization/hpoa_table_creator.py b/src/pyphetools/visualization/hpoa_table_creator.py index 24115e11..5afcc9d6 100644 --- a/src/pyphetools/visualization/hpoa_table_creator.py +++ b/src/pyphetools/visualization/hpoa_table_creator.py @@ -11,6 +11,7 @@ from ..creation.hp_term import HpTerm from ..creation.individual import Individual from ..creation.metadata import MetaData +from ..creation.mode_of_inheritance import Moi from .counted_hpo_term import CountedHpoTerm, CohortTermCounter from .onset_calculator import OnsetCalculator @@ -347,6 +348,13 @@ def filter_diseases(disease_id, ppkt_list): break print(f"[INFO] Extracted {(len(target_list))} from {(len(ppkt_list))} phenopackets with {disease_id}\n") return target_list + + def add_moi(self, mode_of_inheritance:Moi, pmid:str): + """ + Use this method to add mode of inheritance (MOI) data from a publication with the indicated pmid + """ + self._moi_d[pmid].append(mode_of_inheritance.to_HPO()) + def autosomal_recessive(self, pmid): moi_term = HpTerm(hpo_id="HP:0000007", label="Autosomal recessive inheritance") diff --git a/test/test_age_isoformater.py b/test/test_age_isoformater.py index 25fb7a3f..9632831b 100644 --- a/test/test_age_isoformater.py +++ b/test/test_age_isoformater.py @@ -1,66 +1,54 @@ -import unittest +import pytest from pyphetools.creation import AgeIsoFormater +class TestAgeIsoFormater: + + @pytest.mark.parametrize( + 'year, month, day, iso', + [ + ( 2, 3, 5, "P2Y3M5D"), + ( 29, 5, 25, "P29Y5M25D"), + ( 99,1,24,"P99Y1M24D"), + ] + ) + def test_ymd( + self, + year: int, + month: int, + day: int, + iso:str, + ): + iso_age = AgeIsoFormater.to_string(y=year, m=month, d=day) + assert iso == iso_age + + @pytest.mark.parametrize( + 'month, iso', + [ + ( 5, "P5M"), + ( 0.5,"P15D"), + (0.8, "P24D"), + (11, "P11M"), + (12, "P1Y"), + (16, "P1Y4M"), + ("n.a.", "NOT_PROVIDED"), + (None, "NOT_PROVIDED"), + (float("nan"), "NOT_PROVIDED"), + (0, "P0D") + ] + ) + def test_numerical_month( 
+ self, + month: float, + iso: str + ): + iso_age = AgeIsoFormater.from_numerical_month(month=month) + assert iso == iso_age + + -class TestAgeIsoFormater(unittest.TestCase): - def test_basic1(self): - iso_age = AgeIsoFormater.to_string(y=2, m=3, d=5) - self.assertEqual("P2Y3M5D", iso_age) - - def test_basic2(self): - """ - test that 13 months are normalized to 1 year 1 month - """ - iso_age = AgeIsoFormater.to_string(y=42, m=13, d=5) - self.assertEqual("P43Y1M5D", iso_age) - - def test_5m(self): - iso_age = AgeIsoFormater.from_numerical_month(5) - self.assertEqual("P5M", iso_age) - - def test_15d(self): - iso_age = AgeIsoFormater.from_numerical_month(0.5) - self.assertEqual("P15D", iso_age) - - def test_24d(self): - iso_age = AgeIsoFormater.from_numerical_month(0.8) - self.assertEqual("P24D", iso_age) - - def test_12m(self): - iso_age = AgeIsoFormater.from_numerical_month(12) - self.assertEqual("P1Y", iso_age) - - def test_16m(self): - iso_age = AgeIsoFormater.from_numerical_month(16) - self.assertEqual("P1Y4M", iso_age) - - def test_na(self): - """ - Test we return NOT_PROVIDED (from the Contants class) if we cannot parse the cell contents - """ - iso_age = AgeIsoFormater.from_numerical_month("n.a.") - self.assertEqual("NOT_PROVIDED", iso_age) - - def test_none(self): - """ - Test we return NOT_PROVIDED (from the Contants class) if we cannot parse the cell contents - """ - iso_age = AgeIsoFormater.from_numerical_month(None) - self.assertEqual("NOT_PROVIDED", iso_age) - - def test_nan(self): - """ - Test we return NOT_PROVIDED (from the Contants class) if we cannot parse the cell contents - """ - iso_age = AgeIsoFormater.from_numerical_month(float("nan")) - self.assertEqual("NOT_PROVIDED", iso_age) - - def test_newborn(self): - iso_age = AgeIsoFormater.from_numerical_month(0) - self.assertEqual("P0D", iso_age) diff --git a/test/test_moi.py b/test/test_moi.py new file mode 100644 index 00000000..f8088d34 --- /dev/null +++ b/test/test_moi.py @@ -0,0 +1,59 @@ +import 
pytest + +from pyphetools.creation import Moi + + +class TestMoi: + + @pytest.mark.parametrize( + 'moi, hpterm_id, hpterm_label', + [ + ( + Moi.AD, + 'HP:0000006', + 'Autosomal dominant inheritance', + ), + ( + Moi.AR, + 'HP:0000007', + 'Autosomal recessive inheritance', + ), + ( + Moi.XLI, + 'HP:0001417', + 'X-linked inheritance' + ), + ( + Moi.XLR, + 'HP:0001419', + 'X-linked recessive inheritance' + ), + ( + Moi.XLD, + 'HP:0001423', + 'X-linked dominant inheritance' + ), + ( + Moi.MITO, + 'HP:0001427', + 'Mitochondrial inheritance' + ), + ( + Moi.YLI, + 'HP:0001450', + 'Y-linked inheritance' + ) + + ] + ) + def test_moi( + self, + moi: Moi, + hpterm_id: str, + hpterm_label: str, + ): + hpterm = moi.to_HPO() + + assert hpterm_id == hpterm.id + assert hpterm_label == hpterm.label +