Skip to content

Commit

Permalink
fixed age parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
pnrobinson committed Mar 26, 2024
1 parent 7ec2ce8 commit 3e437dc
Show file tree
Hide file tree
Showing 7 changed files with 80 additions and 37 deletions.
2 changes: 1 addition & 1 deletion src/pyphetools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from . import visualization
from . import validation

__version__ = "0.9.67"
__version__ = "0.9.69"

__all__ = [
"creation",
Expand Down
50 changes: 40 additions & 10 deletions src/pyphetools/creation/age_column_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ class Iso8601AgeColumnMapper(AgeColumnMapper):
def __init__(self, column_name) -> None:
super().__init__(column_name=column_name)

def map_cell(self, cell_contents) -> str:
def map_cell(self, cell_contents) -> PyPheToolsAge:
contents = self._clean_contents(cell_contents=cell_contents)
match = re.search(ISO8601_REGEX, contents)
if match:
Expand All @@ -157,22 +157,25 @@ class YearMonthAgeColumnMapper(AgeColumnMapper):
def __init__(self, column_name) -> None:
super().__init__(column_name=column_name)

def map_cell(self, cell_contents) -> str:
def map_cell(self, cell_contents) -> PyPheToolsAge:
contents = self._clean_contents(cell_contents=cell_contents)
try:
match = re.search(YEAR_AND_MONTH_REGEX, contents)
if match:
years = int(match.group(1))
months = int(match.group(2))
return AgeIsoFormater.to_string(y=years, m=months)
age_string = f"P{years}Y{months}M"
return IsoAge(y=years, m=months, age_string=age_string)
match = re.search(YEAR_REGEX, contents)
if match:
years = int(match.group(1))
return AgeIsoFormater.to_string(y=years)
age_string = f"P{years}Y"
return IsoAge(y=years, age_string=age_string)
match = re.search(MONTH_REGEX, contents)
if match:
months = int(match.group(1))
return AgeIsoFormater.to_string(m=months)
age_string = f"P{months}M"
return IsoAge(m=months, age_string=age_string)
except ValueError as verr:
print(f"Could not parse {cell_contents} as year/month: {verr}")
return NoneAge(contents)
Expand All @@ -184,18 +187,45 @@ class MonthAgeColumnMapper(AgeColumnMapper):
def __init__(self, column_name) -> None:
super().__init__(column_name=column_name)

def map_cell(self, cell_contents) -> str:
def map_cell(self, cell_contents) -> PyPheToolsAge:
# assume month encoded by integer or float.
contents = self._clean_contents(cell_contents=cell_contents)
return AgeIsoFormater.from_numerical_month(contents)
month = str(contents)
if month.isdigit():
full_months = int(month)
days = 0
age_string = f"P{full_months}M"
if full_months < 12:
return IsoAge(m=full_months, age_string=age_string)
elif full_months == 12:
return IsoAge(y=1, age_string="P1Y")
else:
raise ValueError(f"Improperly coded months : {month} -- should be at most 12.")
elif month.replace('.', '', 1).isdigit() and month.count('.') < 2:
# a float such as 0.9 (months)
months = float(month)
avg_num_days_in_month = 30.437
floor_months = math.floor(months)
if floor_months == 0.0:
days = int(months * avg_num_days_in_month)
full_months = 0
age_string = f"P{days}D"
return IsoAge(d=days, age_string=age_string)
else:
remainder = months - floor_months
full_months = int(months - remainder)
days = int(remainder * avg_num_days_in_month)
age_string = f"P{full_months}M{days}D"
return IsoAge(m=full_months, d=days, age_string=age_string)



class YearAgeColumnMapper(AgeColumnMapper):

def __init__(self, column_name) -> None:
super().__init__(column_name=column_name)

def map_cell(self, cell_contents) -> str:
def map_cell(self, cell_contents) -> PyPheToolsAge:
"""
Extract an iso8601 string for age recorded as a year (either an int such as 4 or a float such as 4.25 for P4Y3M)
:param age: an int representing years or a float such as 2.5 for two and a half years
Expand Down Expand Up @@ -233,7 +263,7 @@ def __init__(self, column_name:str, string_to_iso_d) -> None:
super().__init__(column_name=column_name)
self._string_to_iso_d = string_to_iso_d

def map_cell(self, cell_contents) -> str:
def map_cell(self, cell_contents) -> PyPheToolsAge:
if cell_contents not in self._string_to_iso_d:
print(f"[WARNING] Could not find \"{cell_contents}\" in custom dictionary")
return NoneAge(cell_contents)
Expand Down Expand Up @@ -263,7 +293,7 @@ class HpoAgeColumnMapper(AgeColumnMapper):
def __init__(self, column_name:str) -> None:
super().__init__(column_name=column_name)

def map_cell(self, cell_contents) -> str:
def map_cell(self, cell_contents) -> PyPheToolsAge:
contents = self._clean_contents(cell_contents=cell_contents)
if contents in HPO_ONSET_TERMS:
return HpoAge(hpo_onset_label=contents)
Expand Down
2 changes: 0 additions & 2 deletions src/pyphetools/creation/cohort_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,6 @@ def get_individuals(self) -> List[Individual]:
if not self._df.index.name in self._df.columns:
df = self._df.reset_index()
individuals = []
age_onset_column_name = self._age_of_onset_mapper.get_column_name()
age_last_encounter_column_name = self._age_at_last_encounter_mapper.get_column_name()
sex_column_name = self._sex_mapper.get_column_name()
if self._variant_mapper is None:
variant_colname = None
Expand Down
22 changes: 15 additions & 7 deletions src/pyphetools/creation/create_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import pandas as pd
from collections import defaultdict
from .hpo_parser import HpoParser
from typing import List
import hpotk
from .case_template_encoder import REQUIRED_H1_FIELDS, REQUIRED_H2_FIELDS

class TemplateCreator:
Expand All @@ -12,7 +14,8 @@ def __init__(self, hp_json:str) -> None:
parser = HpoParser(hpo_json_file=hp_json)
self._hpo_cr = parser.get_hpo_concept_recognizer()
self._hpo_ontology = parser.get_ontology()
self._hp_term_list = list()
self._all_added_hp_term_set = set()



def add_seed_terms(self, text:str) -> None:
Expand All @@ -24,17 +27,20 @@ def add_seed_terms(self, text:str) -> None:
:param text: free text that contains HPO term labels to be mined
:type text: str
"""
hp_terms = set()
for line in text.split("\n"):
hpo_term_list = self._hpo_cr.parse_cell(line)
for hpt in hpo_term_list:
hp_terms.add(hpt)
self._all_added_hp_term_set.add(hpt)


def arrange_terms(self) -> List[hpotk.model.TermId]:
hp_term_list = list()
## Arrange hp_terms so that all terms that belong to a given top level term go together
PHENO_ROOT_TERM_ID = "HP:0000118"
top_level_term_ids = self._hpo_ontology.graph.get_children(PHENO_ROOT_TERM_ID, False)
top_level_term_ids = list(top_level_term_ids)
top_level_d = defaultdict(list)
for hpt in hp_terms:
for hpt in self._all_added_hp_term_set:
found= False
for tlt in top_level_term_ids:
if self._hpo_ontology.graph.is_descendant_of(hpt.id,tlt):
Expand All @@ -46,8 +52,9 @@ def add_seed_terms(self, text:str) -> None:
# Now the terms can be arranged by top level ancestor, which will make it easier to enter
# in the Excel sheet
for tlt, hpt_list in top_level_d.items():
self._hp_term_list.extend(hpt_list)
print(f"[INFO] Add {len(self._hp_term_list)} HPO terms to template.")
hp_term_list.extend(hpt_list)
print(f"[INFO] Add {len(hp_term_list)} HPO terms to template.")
return hp_term_list


def create_template(self, disease_id:str, disease_label:str, HGNC_id:str, gene_symbol:str, transcript:str):
Expand All @@ -64,7 +71,8 @@ def create_template(self, disease_id:str, disease_label:str, HGNC_id:str, gene_s
if len(H1_Headers) != len(H2_Headers):
raise ValueError("Header lists must have same length")
EMPTY_STRING = ""
for hpt in self._hp_term_list:
hp_term_list = self.arrange_terms()
for hpt in hp_term_list:
H1_Headers.append(hpt.label)
H2_Headers.append(hpt.id)
df = pd.DataFrame(columns=H1_Headers)
Expand Down
8 changes: 4 additions & 4 deletions src/pyphetools/creation/individual.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,11 @@ def __init__(self,
self._sex = PPKt.Sex.UNKNOWN_SEX
else:
self._sex = sex
#if not isinstance(age_of_onset, PyPheToolsAge):
# raise ValueError(f"age_of_onset argument must be PyPheToolsAge but was {type(age_of_onset)}")
if not isinstance(age_of_onset, PyPheToolsAge):
raise ValueError(f"age_of_onset argument must be PyPheToolsAge but was {type(age_of_onset)}")
self._age_of_onset = age_of_onset
#if not isinstance(age_at_last_encounter, PyPheToolsAge):
# raise ValueError(f"age_at_last_encounter argument must be PyPheToolsAge but was {type(age_of_onset)}")
if not isinstance(age_at_last_encounter, PyPheToolsAge):
raise ValueError(f"age_at_last_encounter argument must be PyPheToolsAge but was {type(age_of_onset)}")
self._age_at_last_encounter = age_at_last_encounter
self._vital_status = vital_status
if hpo_terms is None:
Expand Down
2 changes: 0 additions & 2 deletions src/pyphetools/visualization/individual_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,6 @@ class IndividualTable:
def __init__(self, individual_list:List[Individual],
metadata:MetaData=None) -> None:
"""
:param phenopacket_list: List of GA4GH phenopackets to be displayed
:type phenopacket_list: List[PPKt.Phenopacket]
:param individual_list: List of Individual objects to be displayed
:type individual_list: List[Individual]
:param metadata: metadata about the individuals
Expand Down
31 changes: 20 additions & 11 deletions test/test_age_column_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,31 +46,31 @@ def test_year_month_both_1(self):
ageMapper = AgeColumnMapper.by_year_and_month(column_name=TEST_COLUMN)
age_string = "14 y 8 m"
age_iso = ageMapper.map_cell(age_string)
self.assertEqual("P14Y8M", age_iso)
self.assertEqual("P14Y8M", age_iso.age_string)

def test_year_month_both_2(self):
ageMapper = AgeColumnMapper.by_year_and_month(column_name=TEST_COLUMN)
age_string = "7 y 6 m"
age_iso = ageMapper.map_cell(age_string)
self.assertEqual("P7Y6M", age_iso)
self.assertEqual("P7Y6M", age_iso.age_string)

def test_year_month_both_3(self):
ageMapper = AgeColumnMapper.by_year_and_month(column_name=TEST_COLUMN)
age_string = "7y6m"
age_iso = ageMapper.map_cell(age_string)
self.assertEqual("P7Y6M", age_iso)
self.assertEqual("P7Y6M", age_iso.age_string)

def test_year_month_just_year_1(self):
ageMapper = AgeColumnMapper.by_year_and_month(column_name=TEST_COLUMN)
age_string = "7 y"
age_iso = ageMapper.map_cell(age_string)
self.assertEqual("P7Y", age_iso)
self.assertEqual("P7Y", age_iso.age_string)

def test_year_month_just_month_1(self):
ageMapper = AgeColumnMapper.by_year_and_month(column_name=TEST_COLUMN)
age_string = "2 m"
age_iso = ageMapper.map_cell(age_string)
self.assertEqual("P2M", age_iso)
self.assertEqual("P2M", age_iso.age_string)

def test_int_or_float_regex(self):
int_or_float = r"(\d+)(\.\d+)?"
Expand Down Expand Up @@ -118,14 +118,23 @@ def test_custom_dictionary(self):
def test_by_month(self):
ageMapper = AgeColumnMapper.by_month(column_name=TEST_COLUMN)
age_iso = ageMapper.map_cell(5)
self.assertEqual("P5M", age_iso)
age_iso = ageMapper.map_cell("5")
self.assertEqual("P5M", age_iso)
self.assertEqual("P5M", age_iso.age_string)


def test_by_month2(self):
ageMapper = AgeColumnMapper.by_month(column_name=TEST_COLUMN)
age_iso = ageMapper.map_cell(0.5)
self.assertEqual("P15D", age_iso)
self.assertEqual("P15D", age_iso.age_string)

def test_by_month(self):
ageMapper = AgeColumnMapper.by_month(column_name=TEST_COLUMN)
age_iso = ageMapper.map_cell("5")
self.assertEqual("P5M", age_iso.age_string)
age_iso = ageMapper.map_cell(0.8)
self.assertEqual("P24D", age_iso)
self.assertEqual("P24D", age_iso.age_string)
age_iso = ageMapper.map_cell(12)
self.assertEqual("P1Y", age_iso)
self.assertEqual("P1Y", age_iso.age_string)




0 comments on commit 3e437dc

Please sign in to comment.