diff --git a/src/pyphetools/__init__.py b/src/pyphetools/__init__.py index 135f170..cf2159b 100644 --- a/src/pyphetools/__init__.py +++ b/src/pyphetools/__init__.py @@ -5,7 +5,7 @@ from . import validation -__version__ = "0.9.109" +__version__ = "0.9.110" __all__ = [ diff --git a/src/pyphetools/creation/import_template.py b/src/pyphetools/creation/import_template.py index ba2c221..f64f324 100644 --- a/src/pyphetools/creation/import_template.py +++ b/src/pyphetools/creation/import_template.py @@ -14,9 +14,9 @@ class TemplateImporter: ORCID_regex = r"^\d{4}-\d{4}-\d{4}-\d{4}$" - def __init__(self,template:str, - created_by:str, - hp_json:str=None) -> None: + def __init__(self, template: str, + created_by: str, + hp_json: str = None) -> None: """Constructor :param template: path to Excel template file @@ -46,7 +46,7 @@ def __init__(self,template:str, self._moi_list = list() @staticmethod - def _get_data_from_template(df:pd.DataFrame) -> typing.Tuple[str,str,str,str,str]: + def _get_data_from_template(df: pd.DataFrame) -> typing.Tuple[str, str, str, str, str]: """Check that the template (dataframe) has the columns "HGNC_id", "gene_symbol", "transcript", If each row has the same value for these columns, then the template is valid @@ -62,7 +62,7 @@ def _get_data_from_template(df:pd.DataFrame) -> typing.Tuple[str,str,str,str,str if item not in df.columns: raise ValueError(f"Invalid template -- could not find the \"{item}\" column") ## We need to skip the first row (second row of excel file), which has the datatypes - for _, row in df.iloc[1: , :].iterrows(): + for _, row in df.iloc[1:, :].iterrows(): contents_d["disease_id"].add(row["disease_id"]) contents_d["disease_label"].add(row["disease_label"]) contents_d["HGNC_id"].add(row["HGNC_id"]) @@ -84,7 +84,7 @@ def _get_data_from_template(df:pd.DataFrame) -> typing.Tuple[str,str,str,str,str return disease_id, disease_label, HGNC_id, gene_symbol, transcript @staticmethod - def _get_allelic_requirement(df:pd.DataFrame): + def _get_allelic_requirement(df: pd.DataFrame): """Determine allelic requirement Note that we always expect the column allele_1 to have content. If each row of allele_2 is "na", then the allelic requirement is MONO_ALLELIC @@ -99,7 +99,7 @@ def _get_allelic_requirement(df:pd.DataFrame): from pyphetools.creation import AllelicRequirement total_row_count = 0 total_allele_2_na_count = 0 - for _, row in df.iloc[1: , :].iterrows(): + for _, row in df.iloc[1:, :].iterrows(): a1 = row["allele_1"] a2 = row["allele_2"] if a1 == "na": @@ -116,15 +116,13 @@ def _get_allelic_requirement(df:pd.DataFrame): else: raise ValueError(f"Error: {total_allele_2_na_count} rows with two alleles but {total_row_count} total rows") - - def import_phenopackets_from_template(self, - deletions:typing.Set[str]=set(), - duplications:typing.Set[str]=set(), - inversions:typing.Set[str]=set(), - translocations:typing.Set[str]=set(), - hemizygous:bool=False, - leniant_MOI:bool=False): + deletions: typing.Set[str] = set(), + duplications: typing.Set[str] = set(), + inversions: typing.Set[str] = set(), + translocations: typing.Set[str] = set(), + hemizygous: bool = False, + leniant_MOI: bool = False): """Import the data from an Excel template and create a collection of Phenopackets This method writes the individuals as Phenopackets to file and also returns Individuals and the CValidator. ToDo -- refactor to avoid side effects. @@ -147,7 +145,7 @@ def import_phenopackets_from_template(self, :returns: tuple with individual list and CohortValidator that optionally can be used to display in a notebook :rtype: typing.Tuple[typing.List[pyphetools.creation.Individual], pyphetools.validation.CohortValidator] """ - from pyphetools.creation import HpoParser + from pyphetools.creation import HpoParser from pyphetools.creation import CaseTemplateEncoder from pyphetools.creation import VariantManager from pyphetools.validation import CohortValidator @@ -161,11 +159,11 @@ def import_phenopackets_from_template(self, disease_id, disease_label, HGNC_id, gene_symbol, transcript = TemplateImporter._get_data_from_template(df) print(f"Importing {disease_id}, {disease_label}, {HGNC_id}, {gene_symbol}, {transcript}") vman = VariantManager(df=df, individual_column_name="individual_id", - allele_1_column_name="allele_1", - allele_2_column_name="allele_2", - gene_id=HGNC_id, - gene_symbol=gene_symbol, - transcript=transcript) + allele_1_column_name="allele_1", + allele_2_column_name="allele_2", + gene_id=HGNC_id, + gene_symbol=gene_symbol, + transcript=transcript) if len(deletions) > 0: vman.code_as_chromosomal_deletion(deletions) if len(duplications) > 0: @@ -177,12 +175,18 @@ def import_phenopackets_from_template(self, if vman.has_unmapped_alleles(): mapped = vman.get_mapped_allele_count() print(f"We were able to map {mapped} alleles.") - print("The following alleles could not be mapped. Either there is an error or the variants are structural and require special treatment (see documentation)") + print("The following alleles could not be mapped.") + print("Either there is an error or the variants are structural and require special treatment.") + struct_vars = list() for uma in vman.get_unmapped_alleles(): if uma.startswith("c."): print(f"- {transcript}:{uma}") else: - print(f"- {uma} (may require coding as structural variant)") + struct_vars.append(uma) + if len(struct_vars) > 0: + print("The following may require coding as a structural variant)") + for struct_var in struct_vars: + print(struct_var) print("Fix this error and then try again!") sys.exit(1) vman.add_variants_to_individuals(individuals, hemizygous=hemizygous) @@ -193,17 +197,16 @@ def import_phenopackets_from_template(self, cvalidator = CohortValidator(cohort=individuals, ontology=hpo_ontology, min_hpo=1) else: all_req = TemplateImporter._get_allelic_requirement(df) - cvalidator = CohortValidator(cohort=individuals, ontology=hpo_ontology, min_hpo=1, allelic_requirement=all_req) + cvalidator = CohortValidator(cohort=individuals, ontology=hpo_ontology, min_hpo=1, + allelic_requirement=all_req) if cvalidator.n_removed_individuals() > 0: print(f"Removed {cvalidator.n_removed_individuals()} individuals with unfixable errors") ef_individuals = cvalidator.get_error_free_individual_list() encoder.output_individuals_as_phenopackets(individual_list=ef_individuals) return individuals, cvalidator - - @staticmethod - def check_disease_entries(ppkt_list:typing.List[PPKt.Phenopacket]) -> None: + def check_disease_entries(ppkt_list: typing.List[PPKt.Phenopacket]) -> None: disease_count_d = defaultdict(int) for ppkt in ppkt_list: # Each phenopacket must have one disease @@ -245,12 +248,11 @@ def filter_diseases(disease_id, ppkt_list): print(f"[INFO] Extracted {(len(target_list))} from {(len(ppkt_list))} phenopackets with {disease_id}\n") return target_list - def create_hpoa_from_phenopackets(self, - pmid:str, - mode_of_inheritance:Moi, - ppkt_dir:str="phenopackets", - target:str=None) -> pd.DataFrame: + pmid: str, + mode_of_inheritance: Moi, + ppkt_dir: str = "phenopackets", + target: str = None) -> pd.DataFrame: """Create an HPO annotation (HPOA) file from the current cohort :param pmid: PubMed id for the mode of inheritance diff --git a/src/pyphetools/creation/variant_manager.py b/src/pyphetools/creation/variant_manager.py index 17f381d..540fc4a 100644 --- a/src/pyphetools/creation/variant_manager.py +++ b/src/pyphetools/creation/variant_manager.py @@ -7,6 +7,7 @@ from .variant_validator import VariantValidator from .structural_variant import StructuralVariant + def get_pickle_filename(name): """ provide standard filenaming convention. We pickle results from VariantValidator to avoid @@ -15,6 +16,7 @@ def get_pickle_filename(name): """ return f"variant_validator_cache_{name}.pickle" + def load_variant_pickle(name): """ Load the pickle file. If the file cannot be found, return None. If it is found, return the @@ -28,6 +30,7 @@ def load_variant_pickle(name): loaded_object = pickle.load(f) return loaded_object + def write_variant_pickle(name, my_object): """ Write a dictionary to pickled file @@ -38,7 +41,6 @@ def write_variant_pickle(name, my_object): pickle.dump(my_object, f) - class VariantManager: """This class is designed to extract Variant objects from a pandas DataFrame that represents the input data. It will work out of the box for dataframes created by the CaseTemplateEncoder, and can be adapted to work @@ -61,7 +63,7 @@ class VariantManager: gene_symbol=gnas_symbol, allele_1_column_name="allele_1") - See [variant_manager](https://monarch-initiative.github.io/pyphetools/api/creation/variant_manager/) for more information. + See also [variant_manager](https://monarch-initiative.github.io/pyphetools/api/creation/variant_manager/). :param df: DataFrame representing the input data :type df: pd.DataFrame @@ -73,22 +75,22 @@ class VariantManager: :type allele_1_column_name: str :param allele_2_column_name: name of the column with alleles (#2), optional :type allele_2_column_name: str - :param gene_symbol: Symbol of affectged gene (only required if chromosomal variants need to be coded) + :param gene_symbol: Symbol of affected gene (only required if chromosomal variants need to be coded) :type gene_symbol: str - :param gene_id: HGNC identifier of affectged gene (only required if chromosomal variants need to be coded) + :param gene_id: HGNC identifier of affected gene (only required if chromosomal variants need to be coded) :type gene_id: str """ def __init__(self, - df:pd.DataFrame, - individual_column_name:str, - transcript:str, - gene_symbol:str, - allele_1_column_name:str, - allele_2_column_name:str=None, - gene_id:str=None, - overwrite:bool=False - ): + df: pd.DataFrame, + individual_column_name: str, + transcript: str, + gene_symbol: str, + allele_1_column_name: str, + allele_2_column_name: str = None, + gene_id: str = None, + overwrite: bool = False + ): if not isinstance(df, pd.DataFrame): raise ValueError(f"The \"df\" argument must be a pandas DataFrame but was {type(df)}") if individual_column_name not in df.columns: @@ -96,7 +98,8 @@ def __init__(self, if allele_1_column_name not in df.columns: raise ValueError(f"The \"allele_1_column_name\" argument must be a a column in df (a pandas DataFrame)") if allele_2_column_name is not None and allele_2_column_name not in df.columns: - raise ValueError(f"If not None, the \"allele_2_column_name\" argument must be a a column in df (a pandas DataFrame)") + raise ValueError( + f"If not None, the \"allele_2_column_name\" argument must be a a column in df (a pandas DataFrame)") self._dataframe = df self._individual_column_name = individual_column_name self._transcript = transcript @@ -113,21 +116,20 @@ def __init__(self, self._pmid_column_name = None self._create_variant_d(overwrite) - def _format_pmid_id(self, identifier, pmid) -> str: if pmid is not None: return f"{pmid}_{identifier}" else: return identifier - def _get_identifier_with_pmid(self, row:pd.Series) -> str: + def _get_identifier_with_pmid(self, row: pd.Series) -> str: """Get an identifier such as PMID_33087723_A2 for a daa row with PMID:33087723 and identifier within that publication A2 Identifiers such as P1 are commonly used and there is a risk of a clash with collections of phenopackets from various papers. Therefore, we use an identifier such as PMID_33087723_A2 if we can find a PMID """ individual_id = row[self._individual_column_name] - individual_id = str(individual_id) # prevent automatic conversion into int for patient id 1, 2, 3 etc + individual_id = str(individual_id) # prevent automatic conversion into int for patient id 1, 2, 3 etc if self._pmid_column_name is not None: return self._format_pmid_id(identifier=individual_id, pmid=row[self._pmid_column_name]) else: @@ -153,7 +155,7 @@ def _create_variant_d(self, overwrite) -> None: variant_set = set() # we expect HGVS nomenclature. Everything else will be parsed as chromosomal vvalidator = VariantValidator(genome_build=genome_assembly, transcript=self._transcript) # The DataFrame has two header rows. - # For CaseTemplateEncoder, the second header row in effect is the first row of the DataFrame, so we drop it here. + # For CaseTemplateEncoder, the second header row is the first row of the DataFrame, so we drop it here. # For CaseTemplateEncoder, the second row will contain "str" in the second row of the PMID column # For other encoders, there may not be a "PMID" column, and if so it will not contain "CURIE" in the second row if "PMID" in self._dataframe.columns and self._dataframe.iloc[0]["PMID"] == "CURIE": @@ -166,7 +168,8 @@ def _create_variant_d(self, overwrite) -> None: if allele1.startswith(" ") or allele1.endswith(" "): raise ValueError(f"Malformed allele_1 description that starts/ends with whitespace: \"{allele1}\".") self._individual_to_alleles_d[individual_id].append(allele1) - if allele1.startswith("c."): + # take mRNA (c.) or noncoding RNA (n.) + if allele1.startswith("c.") or allele1.startswith("n."): variant_set.add(allele1) else: self._unmapped_alleles.add(allele1) @@ -179,7 +182,7 @@ def _create_variant_d(self, overwrite) -> None: self._individual_to_alleles_d[individual_id].append(allele2) if allele2.startswith(" ") or allele2.endswith(" "): raise ValueError(f"Malformed allele_2 description that starts/ends with whitespace: \"{allele2}\".") - if allele2.startswith("c."): + if allele2.startswith("c.") or allele2.startswith("n."): variant_set.add(allele2) else: self._unmapped_alleles.add(allele2) @@ -194,7 +197,7 @@ def _create_variant_d(self, overwrite) -> None: self._var_d[v] = var except Exception as e: print(f"[ERROR] Could not retrieve Variant Validator information for {v}: {str(e)}") - self._unmapped_alleles.add(v) # This allows us to use the chromosomal mappers. + self._unmapped_alleles.add(v) # This allows us to use the chromosomal mappers. write_variant_pickle(name=self._gene_symbol, my_object=self._var_d) def code_as_chromosomal_deletion(self, allele_set) -> None: @@ -207,11 +210,14 @@ def code_as_chromosomal_deletion(self, allele_set) -> None: for a in allele_set: if not a in self._unmapped_alleles: print(f"Could not find allele \"{a}\"") - raise ValueError("[ERROR] We can only map alleles that were passed to the constructor - are you trying to map \"new\" alleles?") + raise ValueError( + "[ERROR] We can only map alleles that were passed to the constructor - are you trying to map \"new\" alleles?") if self._gene_id is None or self._gene_symbol is None: - raise ValueError("[ERROR] We cannot use this method unless the gene ID (HGNC) and symbol were passed to the constructor") + raise ValueError( + "[ERROR] We cannot use this method unless the gene ID (HGNC) and symbol were passed to the constructor") for allele in allele_set: - var = StructuralVariant.chromosomal_deletion(cell_contents=allele, gene_symbol=self._gene_symbol, gene_id=self._gene_id) + var = StructuralVariant.chromosomal_deletion(cell_contents=allele, gene_symbol=self._gene_symbol, + gene_id=self._gene_id) self._unmapped_alleles.remove(allele) self._var_d[allele] = var @@ -222,11 +228,14 @@ def code_as_chromosomal_duplication(self, allele_set) -> None: """ # first check that all of the alleles are in self._unmapped_alleles if not allele_set.issubset(self._unmapped_alleles): - raise ValueError("[ERROR] We can only map alleles that were passed to the constructor - are you trying to map \"new\" alleles?") + raise ValueError( + "[ERROR] We can only map alleles that were passed to the constructor - are you trying to map \"new\" alleles?") if self._gene_id is None or self._gene_symbol is None: - raise ValueError("[ERROR] We cannot use this method unless the gene ID (HGNC) and symbol were passed to the constructor") + raise ValueError( + "[ERROR] We cannot use this method unless the gene ID (HGNC) and symbol were passed to the constructor") for allele in allele_set: - var = StructuralVariant.chromosomal_duplication(cell_contents=allele, gene_symbol=self._gene_symbol, gene_id=self._gene_id) + var = StructuralVariant.chromosomal_duplication(cell_contents=allele, gene_symbol=self._gene_symbol, + gene_id=self._gene_id) self._unmapped_alleles.remove(allele) self._var_d[allele] = var @@ -237,11 +246,14 @@ def code_as_chromosomal_inversion(self, allele_set) -> None: """ # first check that all of the alleles are in self._unmapped_alleles if not allele_set.issubset(self._unmapped_alleles): - raise ValueError("[ERROR] We can only map alleles that were passed to the constructor - are you trying to map \"new\" alleles?") + raise ValueError( + "[ERROR] We can only map alleles that were passed to the constructor - are you trying to map \"new\" alleles?") if self._gene_id is None or self._gene_symbol is None: - raise ValueError("[ERROR] We cannot use this method unless the gene ID (HGNC) and symbol were passed to the constructor") + raise ValueError( + "[ERROR] We cannot use this method unless the gene ID (HGNC) and symbol were passed to the constructor") for allele in allele_set: - var = StructuralVariant.chromosomal_inversion(cell_contents=allele, gene_symbol=self._gene_symbol, gene_id=self._gene_id) + var = StructuralVariant.chromosomal_inversion(cell_contents=allele, gene_symbol=self._gene_symbol, + gene_id=self._gene_id) self._unmapped_alleles.remove(allele) self._var_d[allele] = var @@ -252,15 +264,17 @@ def code_as_chromosomal_translocation(self, allele_set) -> None: """ # first check that all of the alleles are in self._unmapped_alleles if not allele_set.issubset(self._unmapped_alleles): - raise ValueError("[ERROR] We can only map alleles that were passed to the constructor - are you trying to map \"new\" alleles?") + raise ValueError( + "[ERROR] We can only map alleles that were passed to the constructor - are you trying to map \"new\" alleles?") if self._gene_id is None or self._gene_symbol is None: - raise ValueError("[ERROR] We cannot use this method unless the gene ID (HGNC) and symbol were passed to the constructor") + raise ValueError( + "[ERROR] We cannot use this method unless the gene ID (HGNC) and symbol were passed to the constructor") for allele in allele_set: - var = StructuralVariant.chromosomal_translocation(cell_contents=allele, gene_symbol=self._gene_symbol, gene_id=self._gene_id) + var = StructuralVariant.chromosomal_translocation(cell_contents=allele, gene_symbol=self._gene_symbol, + gene_id=self._gene_id) self._unmapped_alleles.remove(allele) self._var_d[allele] = var - def to_summary(self) -> pd.DataFrame: """ Create and return a DataFrame with current status of mapping efforts. The resulting dataframe can be used to @@ -290,8 +304,7 @@ def get_unmapped_alleles(self): def get_mapped_allele_count(self): return len(self._var_d) - - def add_variants_to_individuals(self, individual_list:List[Individual], hemizygous:bool=False): + def add_variants_to_individuals(self, individual_list: List[Individual], hemizygous: bool = False): """ Add Variant objects to individuals. Here, we use the map self._individual_to_alleles_d that relates the individual IDs to the allele strings in the original data, together with the @@ -304,7 +317,7 @@ def add_variants_to_individuals(self, individual_list:List[Individual], hemizygo """ if len(self._unmapped_alleles) > 0: raise ValueError(f"Need to map all allele strings before using this method but " - f"{len(self._unmapped_alleles)} were unmapped. Try variantManager.to_summary()") + f"{len(self._unmapped_alleles)} were unmapped. Try variantManager.to_summary()") for i in individual_list: citation = i.get_citation() if citation is not None: @@ -313,7 +326,8 @@ def add_variants_to_individuals(self, individual_list:List[Individual], hemizygo else: individual_id = self._format_pmid_id(identifier=i.id, pmid=None) if individual_id not in self._individual_to_alleles_d: - raise ValueError(f"Did not find {i.id} in our dictionary of individuals. Note that this function is intended to work with CaseTemplateEncoder") + raise ValueError( + f"Did not find {i.id} in our dictionary of individuals. Note that this function is intended to work with CaseTemplateEncoder") vlist = self._individual_to_alleles_d.get(individual_id) if len(vlist) == 1: # assume monoallelic @@ -364,8 +378,3 @@ def get_variant_d(self): :rtype: Dict[str, Variant] """ return self._var_d - - - - - diff --git a/test/test_measurements.py b/test/test_measurements.py index 1961c0c..2dc2a1a 100644 --- a/test/test_measurements.py +++ b/test/test_measurements.py @@ -3,8 +3,6 @@ from pyphetools.creation import Measurements from pyphetools.pp.v202 import Measurement as Measurement202 -from src.pyphetools.pp.v202 import OntologyClass - class TestMeasurements: