Merge pull request #103 from monarch-initiative/develop

Develop
monarch-initiative · Mar 29, 2024 · 872588e · 872588e
2 parents 7333bdb + 10b63af
commit 872588e
Show file tree

Hide file tree

Showing 32 changed files with 337 additions and 803 deletions.
diff --git a/docs/api/creation/case_encoder.md b/docs/api/creation/case_encoder.md
diff --git a/docs/api/creation/mixed_cohort_encoder.md b/docs/api/creation/mixed_cohort_encoder.md
diff --git a/docs/developers/installation.md b/docs/developers/installation.md
@@ -1,7 +1,7 @@
 # Installation of pyphetools
 
 
-pyphetools is available as a [PyPI package](https://pypi.org/project/pyphetools/){:target="\_blank"}. 
+pyphetools is available as a [PyPI package](https://pypi.org/project/pyphetools/){:target="\_blank"}.
 
 Most users should install the latest version (the following example creates a virtual environment).
 Note that depending on your system it will be necessary to update pip to be able to install pyphetools.
@@ -23,7 +23,7 @@ To use the kernel in notebooks, enter the following
 
 ```bash title="installing jupyter and running pyphetools in a notebook"
 pip install jupyter ipykernel
-python -m ipykernel install --name "ppt_env" --display-name "ppt_env"
+python -m ipykernel install --user --name "ppt_env" --display-name "ppt_env"
 jupyter-notebook
 ```
 

diff --git a/docs/index.md b/docs/index.md
@@ -5,6 +5,18 @@ pyphetools is a Python package for creating [GA4GH phenopackets](https://phenopa
 from tabular data such as databases or supplemental files found in the medical literature.
 
 
+
+
+This documentation covers the following topics:
+
+- How to use the [Excel template](user-guide/excel.md) to code clinical data
+- How to use [pyphetools classes](tabular/jupyter.md) to convert tabular data (e.g., supplemental tables) to phenopackets
+- Information for [developers](developers/developers.md)
+- A description of the pyphetools [API](api/overview.md)
+
+
+See the following pages for more information:
+
 - **GitHub**:
 Source code is available at the [pyphetools GitHub repository](https://github.com/monarch-initiative/pyphetools){:target="\_blank"}.
 - **PyPI**:

diff --git a/docs/user-guide/choosing_column_mapper.md → docs/tabular/choosing_column_mapper.md b/docs/user-guide/choosing_column_mapper.md → docs/tabular/choosing_column_mapper.md
diff --git a/docs/user-guide/cohort_encoder.md → docs/tabular/cohort_encoder.md b/docs/user-guide/cohort_encoder.md → docs/tabular/cohort_encoder.md
diff --git a/docs/user-guide/constant_column_mapper.md → docs/tabular/constant_column_mapper.md b/docs/user-guide/constant_column_mapper.md → docs/tabular/constant_column_mapper.md
diff --git a/docs/user-guide/jupyter.md → docs/tabular/jupyter.md b/docs/user-guide/jupyter.md → docs/tabular/jupyter.md
@@ -1,30 +1,7 @@
 # Using pyphetools in a Jupyter notebook
 
-This option is intended for people who are comfortable using Python scripting, and is designed to import tabular data such as is commonly found in the supplemental files of medical publications about cohorts of individuals diagnosed with a certain disease. See also the instructions for using an [Excel template](template.md) for entering data with a minimum of scripting.
-
-
-The best way to get a feeling for how to work with pyphetools is to examine the various notenooks in the
-[phenopacket-store](https://github.com/monarch-initiative/phenopacket-store){:target="\_blank"} repository.
-
-This tutorial provides some general tips for how to use the library. The library is intended to be used in a Jupyter notebook environment so that users can check intermediate results.
-There are many ways of setting this up, but here is one that we often use.
-
-
-
-```bash title="installing jupyter and running pyphetools in a notebook"
-python3 -m venv your_env
-source your_env/bin/activiate
-pip install --upgrade pip
-pip install pyphetools
-pip install jupyter ipykernel
-python3 -m ipykernel install --name your_env --user
-jupyter-notebook
-```
-
-The virtual environment (here *your_env*) can be named as desired. The last line opens a Jupyter Notebook page;
-create a new Notebook and choose the kernel called *your_env* (or whatever you called it).
-
 
+This page provides an overview of how to structure a Jupyter notebook to import tabular data. We recommend importing data for one disease at a time.
 
 ### Importing necessary packages
 
@@ -48,7 +25,7 @@ from IPython.display import display, HTML
 
 ### Import the Human Phenotype Ontology (HPO) file
 
-It is useful to import the HPO file and create the `MetaData` object 
+It is useful to import the HPO file and create the `MetaData` object
 (which records your [ORCID](https://orcid.org/) id and the version of the HPO used) in one step.
 
 First, we load the latest HPO file using HPO toolkit:
@@ -58,7 +35,7 @@ ontostore = hpotk.configure_ontology_store()
 hpo = ontostore.load_hpo()
 ```
 
-Now, we can create the `MetaData`: 
+Now, we can create the `MetaData`:
 
 ```python title="Configure MetaData"
 PMID = "PMID:16783569"

diff --git a/docs/user-guide/option_column_mapper.md → docs/tabular/option_column_mapper.md b/docs/user-guide/option_column_mapper.md → docs/tabular/option_column_mapper.md
diff --git a/docs/tabular/overview.md b/docs/tabular/overview.md
@@ -0,0 +1,34 @@
+# Encoding tabular data with pyphetools scripts
+
+This option is intended for people who are comfortable using Python scripting, and is designed to import tabular data such as is commonly found in the supplemental files of medical publications about cohorts of individuals diagnosed with a certain disease. See also the instructions for using an [Excel template](../user-guide/excel.md) for entering data with a minimum of scripting.
+
+
+The best way to get a feeling for how to work with pyphetools is to examine the various notebooks in the
+[phenopacket-store](https://github.com/monarch-initiative/phenopacket-store){:target="\_blank"} repository.
+
+This tutorial provides some general tips for how to use the library. The library is intended to be used in a Jupyter notebook environment so that users can check intermediate results.
+There are many ways of setting this up; the approach we often use is described in the section "Setting up the Jupyter environment" below.
+
+
+The typical use case for using pyphetools in this way is to ingest complicated tables that would be too difficult or unwieldy to import using the Excel template.
+
+
+## Setting up the Jupyter environment
+
+We recommend developing scripts using a Jupyter notebook so that parsing results can be checked.
+
+There are many ways of setting up Jupyter, all of which should work with pyphetools. We use the following approach.
+
+```bash title="installing jupyter and running pyphetools in a notebook"
+python3 -m venv your_env
+source your_env/bin/activate
+pip install --upgrade pip
+pip install pyphetools
+pip install jupyter ipykernel
+python3 -m ipykernel install --name your_env --user
+jupyter-notebook
+```
+
+The virtual environment (here *your_env*) can be named as desired. The last line opens a Jupyter Notebook page;
+create a new Notebook and choose the kernel called *your_env* (or whatever you called it).
+
diff --git a/docs/user-guide/simple_column_mapper.md → docs/tabular/simple_column_mapper.md b/docs/user-guide/simple_column_mapper.md → docs/tabular/simple_column_mapper.md
diff --git a/docs/user-guide/threshold_column_mapper.md → docs/tabular/threshold_column_mapper.md b/docs/user-guide/threshold_column_mapper.md → docs/tabular/threshold_column_mapper.md
diff --git a/docs/user-guide/validation.md → docs/tabular/validation.md b/docs/user-guide/validation.md → docs/tabular/validation.md
diff --git a/docs/user-guide/variant_column_mapper.md → docs/tabular/variant_column_mapper.md b/docs/user-guide/variant_column_mapper.md → docs/tabular/variant_column_mapper.md
diff --git a/docs/user-guide/visualization.md → docs/tabular/visualization.md b/docs/user-guide/visualization.md → docs/tabular/visualization.md
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -37,29 +37,29 @@ nav:
     - 'user-guide/python_notebook.md'
     - 'user-guide/tips_for_curation.md'
     - 'user-guide/variant_notation.md'
+  - Coding tabular data with Python scripts:
+      - Overview: 'tabular/overview.md'
+      - Jupyter notebooks: 'tabular/jupyter.md'
+      - Column mappers:
+        - Choosing a column mapper: 'tabular/choosing_column_mapper.md'
+        - Simple column mapper: 'tabular/simple_column_mapper.md'
+        - Constant column mapper: 'tabular/constant_column_mapper.md'
+        - Option column mapper: 'tabular/option_column_mapper.md'
+        - Threshold column mapper: 'tabular/threshold_column_mapper.md'
+      - Variant column mapper: 'tabular/variant_column_mapper.md'
+      - Cohort encoder: 'tabular/cohort_encoder.md'
+      - Validation: 'tabular/validation.md'
+      - Visualization: 'tabular/visualization.md'
   - Developers:
     - For developers: 'developers/developers.md'
     - Installation: 'developers/installation.md'
     - HPOA files: 'developers/hpoa_editing.md'
     - Internal: 'developers/internal.md'
-    - Custom Python scripts:
-      - Overview: 'user-guide/jupyter.md'
-      - Column mappers:
-        - Choosing a column mapper: 'user-guide/choosing_column_mapper.md'
-        - Simple column mapper: 'user-guide/simple_column_mapper.md'
-        - Constant column mapper: 'user-guide/constant_column_mapper.md'
-        - Option column mapper: 'user-guide/option_column_mapper.md'
-        - Threshold column mapper: 'user-guide/threshold_column_mapper.md'
-      - Variant column mapper: 'user-guide/variant_column_mapper.md'
-      - Cohort encoder: 'user-guide/cohort_encoder.md'
-      - Validation: 'user-guide/validation.md'
-      - Visualization: 'user-guide/visualization.md'
   - API:
     - Overview: 'api/overview.md'
     - creation:
       - overview: 'api/creation.md'
       - AgeColumnMapper: "api/creation/age_column_mapper.md"
-      - CaseEncoder: "api/creation/case_encoder.md"
       - CaseTemplateEncoder: "api/creation/case_template_encoder.md"
       - Citation:  "api/creation/citation.md"
       - CohortEncoder: "api/creation/cohort_encoder.md"
@@ -75,7 +75,6 @@ nav:
       - Individual: "api/creation/individual.md"
       - IsoAge: "api/creation/iso_age.md"
       - MetaData: "api/creation/metadata.md"
-      - MixedCohortEncoder: "api/creation/mixed_cohort_encoder.md"
       - OptionColumnMapper: "api/creation/option_column_mapper.md"
       - SexColumnMapper: "api/creation/sex_column_mapper.md"
       - SimpleColumnMapper: "api/creation/simple_column_mapper.md"

diff --git a/src/pyphetools/__init__.py b/src/pyphetools/__init__.py
@@ -4,7 +4,7 @@
 from . import visualization
 from . import validation
 
-__version__ = "0.9.66"
+__version__ = "0.9.72"
 
 __all__ = [
     "creation",

diff --git a/src/pyphetools/creation/__init__.py b/src/pyphetools/creation/__init__.py
@@ -2,7 +2,6 @@
 from .age_isoformater import AgeIsoFormater
 from .age_of_death_mapper import AgeOfDeathColumnMapper
 from .allelic_requirement import AllelicRequirement
-from .case_encoder import CaseEncoder
 from .case_template_encoder import CaseTemplateEncoder
 from .citation import Citation
 from .cohort_encoder import CohortEncoder
@@ -19,11 +18,11 @@
 from .import_template import TemplateImporter
 from .individual import Individual
 from .metadata import MetaData
-from .mixed_cohort_encoder import MixedCohortEncoder
 from .option_column_mapper import OptionColumnMapper
 from .pyphetools_age import PyPheToolsAge, IsoAge, HpoAge, GestationalAge, HPO_ONSET_TERMS
 from .sex_column_mapper import SexColumnMapper
-from .simple_column_mapper import SimpleColumnMapper, SimpleColumnMapperGenerator
+from .simple_column_mapper import SimpleColumnMapper
+from .scm_generator import SimpleColumnMapperGenerator
 from .structural_variant import StructuralVariant
 from .thresholded_column_mapper import ThresholdedColumnMapper
 from .thresholder import Thresholder
@@ -39,7 +38,6 @@
     "AgeIsoFormater",
     "AgeOfDeathColumnMapper",
     "AllelicRequirement",
-    "CaseEncoder",
     "CaseTemplateEncoder",
     "Citation",
     "CohortEncoder",
@@ -56,7 +54,6 @@
     "HpTermBuilder",
     "Individual",
     "MetaData",
-    "MixedCohortEncoder",
     "OptionColumnMapper",
     "PyPheToolsAge", "IsoAge", "HpoAge", "GestationalAge", "HPO_ONSET_TERMS",
     "SexColumnMapper",

diff --git a/src/pyphetools/creation/age_column_mapper.py b/src/pyphetools/creation/age_column_mapper.py
@@ -140,7 +140,7 @@ class Iso8601AgeColumnMapper(AgeColumnMapper):
     def __init__(self, column_name) -> None:
         super().__init__(column_name=column_name)
 
-    def map_cell(self, cell_contents) -> str:
+    def map_cell(self, cell_contents) -> PyPheToolsAge:
         contents = self._clean_contents(cell_contents=cell_contents)
         match = re.search(ISO8601_REGEX, contents)
         if match:
@@ -157,22 +157,25 @@ class YearMonthAgeColumnMapper(AgeColumnMapper):
     def __init__(self, column_name) -> None:
         super().__init__(column_name=column_name)
 
-    def map_cell(self, cell_contents) -> str:
+    def map_cell(self, cell_contents) -> PyPheToolsAge:
         contents = self._clean_contents(cell_contents=cell_contents)
         try:
             match = re.search(YEAR_AND_MONTH_REGEX, contents)
             if match:
                 years = int(match.group(1))
                 months = int(match.group(2))
-                return AgeIsoFormater.to_string(y=years, m=months)
+                age_string = f"P{years}Y{months}M"
+                return IsoAge(y=years, m=months, age_string=age_string)
             match = re.search(YEAR_REGEX, contents)
             if match:
                 years = int(match.group(1))
-                return AgeIsoFormater.to_string(y=years)
+                age_string = f"P{years}Y"
+                return IsoAge(y=years, age_string=age_string)
             match = re.search(MONTH_REGEX, contents)
             if match:
                 months = int(match.group(1))
-                return AgeIsoFormater.to_string(m=months)
+                age_string = f"P{months}M"
+                return IsoAge(m=months, age_string=age_string)
         except ValueError as verr:
             print(f"Could not parse {cell_contents} as year/month: {verr}")
             return NoneAge(contents)
@@ -184,18 +187,42 @@ class MonthAgeColumnMapper(AgeColumnMapper):
     def __init__(self, column_name) -> None:
         super().__init__(column_name=column_name)
 
-    def map_cell(self, cell_contents) -> str:
+    def map_cell(self, cell_contents) -> PyPheToolsAge:
         # assume month encoded by integer or float.
         contents = self._clean_contents(cell_contents=cell_contents)
-        return AgeIsoFormater.from_numerical_month(contents)
+        month = str(contents)
+        if month.isdigit():
+            full_months = int(month)
+            days = 0
+            age_string = AgeIsoFormater.from_numerical_month(full_months)
+            return IsoAge(m=full_months, age_string=age_string)
+        elif month.replace('.', '', 1).isdigit() and month.count('.') < 2:
+            # a float such as 0.9 (months)
+            months = float(month)
+            avg_num_days_in_month = 30.437
+            floor_months = math.floor(months)
+            if floor_months == 0.0:
+                days = int(months * avg_num_days_in_month)
+                full_months = 0
+                age_string = f"P{days}D"
+                return IsoAge(d=days, age_string=age_string)
+            else:
+                remainder = months - floor_months
+                full_months = int(months - remainder)
+                days = int(remainder * avg_num_days_in_month)
+                age_string = f"P{full_months}M{days}D"
+                return IsoAge(m=full_months, d=days, age_string=age_string)
+        else:
+            return NoneAge("na")
+
 
 
 class YearAgeColumnMapper(AgeColumnMapper):
 
     def __init__(self, column_name) -> None:
         super().__init__(column_name=column_name)
 
-    def map_cell(self, cell_contents) -> str:
+    def map_cell(self, cell_contents) -> PyPheToolsAge:
         """
         Extract an iso8601 string for age recorded as a year (either an int such as 4 or a float such as 4.25 for P4Y3M)
         :param age: an int representing years or a float such as 2.5 for two and a half years
@@ -233,7 +260,7 @@ def __init__(self, column_name:str, string_to_iso_d) -> None:
         super().__init__(column_name=column_name)
         self._string_to_iso_d = string_to_iso_d
 
-    def map_cell(self, cell_contents) -> str:
+    def map_cell(self, cell_contents) -> PyPheToolsAge:
         if cell_contents not in self._string_to_iso_d:
             print(f"[WARNING] Could not find \"{cell_contents}\" in custom dictionary")
             return NoneAge(cell_contents)
@@ -263,7 +290,7 @@ class HpoAgeColumnMapper(AgeColumnMapper):
     def __init__(self, column_name:str) -> None:
         super().__init__(column_name=column_name)
 
-    def map_cell(self, cell_contents) -> str:
+    def map_cell(self, cell_contents) -> PyPheToolsAge:
         contents = self._clean_contents(cell_contents=cell_contents)
         if contents in HPO_ONSET_TERMS:
             return HpoAge(hpo_onset_label=contents)