Skip to content

Commit

Permalink
Merge pull request #81 from Sage-Bionetworks/jbeck/AG-1145/transform_…
Browse files Browse the repository at this point in the history
…overall_scores_testing

jbeck/AG-1145/transform overall scores testing
  • Loading branch information
jaclynbeck-sage authored Aug 15, 2023
2 parents df3a775 + 25e1303 commit 769baae
Show file tree
Hide file tree
Showing 8 changed files with 179 additions and 7 deletions.
1 change: 0 additions & 1 deletion config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,6 @@
ensg: ensembl_gene_id
hgnc_gene_id: hgnc_symbol
geneticsscore: genetics_score
literaturescore: literature_score
overall: target_risk_score
omicsscore: multi_omics_score
destination: *dest
Expand Down
6 changes: 1 addition & 5 deletions src/agoradatatools/etl/transform/overall_scores.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,14 @@ def transform_overall_scores(df: pd.DataFrame) -> pd.DataFrame:
"overall",
"geneticsscore",
"omicsscore",
"literaturescore",
]

# create mapping to deal with missing values as they take different shape across the fields
scored = ["isscored_genetics", "isscored_omics", "isscored_lit"]
scored = ["isscored_genetics", "isscored_omics"]
mapping = dict(zip(interesting_columns[3:], scored))

for field, is_scored in mapping.items():
df.loc[lambda row: row[is_scored] == "N", field] = np.nan

# LiteratureScore is a string in the source file, so convert to numeric
df["literaturescore"] = pd.to_numeric(df["literaturescore"])

# Remove identical rows (see AG-826)
return df[interesting_columns].drop_duplicates()
1 change: 0 additions & 1 deletion test_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,6 @@
ensg: ensembl_gene_id
hgnc_gene_id: hgnc_symbol
geneticsscore: genetics_score
literaturescore: literature_score
overall: target_risk_score
omicsscore: multi_omics_score
destination: *dest
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
,hgnc_gene_id,ensg,overall,overall_rank,geneticsscore,omicsscore,literaturescore,neuropathscore,sm_druggability_bucket,safety_bucket,feasibility_bucket,abability_bucket,new_modality_bucket,tissue_engagement_bucket,pharos_class,classification,safety_bucket_definition,feasibility_bucket_definition,abability_bucket_definition,new_modality_bucket_definition,tissue_engagement_bucket_definition,isscored_genetics,isscored_omics,isscored_lit,isscored_neuropath
378977_429,HIVEP2,ENSG00000010818,3.185141534,3832,1.744062244,1.441079289,1.260049115,0.200555635,7,3,4,6,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y
378978_429,HIVEP2,ENSG00000010818,3.185141534,3832,1.744062244,1.441079289,1.260049115,0.1,7,3,4,6,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y
389156_442,ATP1B3,ENSG00000069849,1.63040118,14056,1.63040118,0,0,0,1,5,4,3,4,2,Tclin,"Small molecule druggable: Protein with a small molecule ligand identified from ChEMBL, meeting TCRD activity criteria.",Clinical data with evidence of intolerable safety profile/adverse drug reactions in the desired modality and with target engagement. Drug for target withdrawn on those grounds.,"Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.","Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,N,N
376410_426,SPR,ENSG00000116096,3.663454319,1251,1.695164935,1.968289384,1.55615776,1.128843106,3,4,3,5,4,2,Tbio,"Targetable by structure: Structurally druggable protein, based on the presence of a druggable pocket in the protein (DrugEBIlity/CanSAR).","More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","High target qualification but low feasibility – Tier C for drug development. Attempt to address gaps and re-evaluate if additional feasibility tools found. Either move to category 1 or 2 if successful, or park in category 5 if not.","Protein located in the cytosol. Not practically accessible to antibody-based therapies, but may be more easily accessible to other modalities.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",N,N,N,N
386108_438,,ENSG00000271743,2.016073569,10997,,,,,,,,,,,,,,,,,,N,N,N,N
386671_439,EVA1C,ENSG00000166979,1.942294708,11562,0.850712097,1.091582611,0.021501052,0,13,2,5,3,4,2,Tbio,Unknown: There is no information on ligands or structure in any of the categories above.,"No major issues found from gene expression, genetic or pharmacological profiling, but has not been extensively tested in humans.",Medium or high target qualification. Interesting target but has gaps in TQ or tractability/druggability which will make progress difficult or slow. Park and watch for developments e.g. until additional TQ or tools emerge.,"Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",N,N,Y,Y
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
,hgnc_gene_id,ensg,overall,overall_rank,geneticsscore,omicsscore,literaturescore,neuropathscore,sm_druggability_bucket,safety_bucket,feasibility_bucket,abability_bucket,new_modality_bucket,tissue_engagement_bucket,pharos_class,classification,safety_bucket_definition,feasibility_bucket_definition,abability_bucket_definition,new_modality_bucket_definition,tissue_engagement_bucket_definition,isscored_genetics,isscored_omics,isscored_lit,isscored_neuropath
378977_429,HIVEP2,ENSG00000010818,3.185141534,3832,1.744062244,1.441079289,1.260049115,0.200555635,7,3,4,6,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",,Y,Y,Y
378978_429,HIVEP2,ENSG00000010818,3.185141534,3832,1.744062244,1.441079289,1.260049115,0.1,7,3,4,6,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,,Y,Y
389156_442,ATP1B3,ENSG00000069849,1.63040118,14056,1.63040118,0,0,0,1,5,4,3,4,2,Tclin,"Small molecule druggable: Protein with a small molecule ligand identified from ChEMBL, meeting TCRD activity criteria.",Clinical data with evidence of intolerable safety profile/adverse drug reactions in the desired modality and with target engagement. Drug for target withdrawn on those grounds.,"Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.","Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,,N
376410_426,SPR,ENSG00000116096,3.663454319,1251,1.695164935,1.968289384,1.55615776,1.128843106,3,4,3,5,4,2,Tbio,"Targetable by structure: Structurally druggable protein, based on the presence of a druggable pocket in the protein (DrugEBIlity/CanSAR).","More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","High target qualification but low feasibility – Tier C for drug development. Attempt to address gaps and re-evaluate if additional feasibility tools found. Either move to category 1 or 2 if successful, or park in category 5 if not.","Protein located in the cytosol. Not practically accessible to antibody-based therapies, but may be more easily accessible to other modalities.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",N,N,N,
386108_438,,ENSG00000271743,2.016073569,10997,,,,,,,,,,,,,,,,,,N,N,N,N
386671_439,EVA1C,ENSG00000166979,1.942294708,11562,0.850712097,1.091582611,0.021501052,0,13,2,5,3,4,2,Tbio,Unknown: There is no information on ligands or structure in any of the categories above.,"No major issues found from gene expression, genetic or pharmacological profiling, but has not been extensively tested in humans.",Medium or high target qualification. Interesting target but has gaps in TQ or tractability/druggability which will make progress difficult or slow. Park and watch for developments e.g. until additional TQ or tools emerge.,"Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",N,N,Y,Y
378979_429,HIVEP2,ENSG00000010818,3.185141534,3832,,1.441079289,1.260049115,0.1,7,3,4,6,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y
386672_439,EVA1C,ENSG00000166979,1.942294708,11562,0.850712097,,0.021501052,0,13,2,5,3,4,2,Tbio,Unknown: There is no information on ligands or structure in any of the categories above.,"No major issues found from gene expression, genetic or pharmacological profiling, but has not been extensively tested in humans.",Medium or high target qualification. Interesting target but has gaps in TQ or tractability/druggability which will make progress difficult or slow. Park and watch for developments e.g. until additional TQ or tools emerge.,"Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y
386673_439,EVA1C,ENSG00000166979,1.942294708,11562,0.850712097,1.091582611,,0,13,2,5,3,4,2,Tbio,Unknown: There is no information on ligands or structure in any of the categories above.,"No major issues found from gene expression, genetic or pharmacological profiling, but has not been extensively tested in humans.",Medium or high target qualification. Interesting target but has gaps in TQ or tractability/druggability which will make progress difficult or slow. Park and watch for developments e.g. until additional TQ or tools emerge.,"Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y
376411_426,SPR,ENSG00000116096,3.663454319,1251,1.695164935,1.968289384,1.55615776,,3,4,3,5,4,2,Tbio,"Targetable by structure: Structurally druggable protein, based on the presence of a druggable pocket in the protein (DrugEBIlity/CanSAR).","More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","High target qualification but low feasibility – Tier C for drug development. Attempt to address gaps and re-evaluate if additional feasibility tools found. Either move to category 1 or 2 if successful, or park in category 5 if not.","Protein located in the cytosol. Not practically accessible to antibody-based therapies, but may be more easily accessible to other modalities.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",N,N,N,Y
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
[
{
"ensg": "ENSG00000010818",
"hgnc_gene_id": "HIVEP2",
"overall": 3.185141534,
"geneticsscore": 1.744062244,
"omicsscore": 1.441079289
},
{
"ensg": "ENSG00000069849",
"hgnc_gene_id": "ATP1B3",
"overall": 1.63040118,
"geneticsscore": 1.63040118,
"omicsscore": 0.0
},
{
"ensg": "ENSG00000116096",
"hgnc_gene_id": "SPR",
"overall": 3.663454319,
"geneticsscore": null,
"omicsscore": null
},
{
"ensg": "ENSG00000271743",
"hgnc_gene_id": null,
"overall": 2.016073569,
"geneticsscore": null,
"omicsscore": null
},
{
"ensg": "ENSG00000166979",
"hgnc_gene_id": "EVA1C",
"overall": 1.942294708,
"geneticsscore": null,
"omicsscore": null
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
[
{
"ensg": "ENSG00000010818",
"hgnc_gene_id": "HIVEP2",
"overall": 3.185141534,
"geneticsscore": 1.744062244,
"omicsscore": 1.441079289
},
{
"ensg": "ENSG00000069849",
"hgnc_gene_id": "ATP1B3",
"overall": 1.63040118,
"geneticsscore": 1.63040118,
"omicsscore": 0.0
},
{
"ensg": "ENSG00000116096",
"hgnc_gene_id": "SPR",
"overall": 3.663454319,
"geneticsscore": null,
"omicsscore": null
},
{
"ensg": "ENSG00000271743",
"hgnc_gene_id": null,
"overall": 2.016073569,
"geneticsscore": null,
"omicsscore": null
},
{
"ensg": "ENSG00000166979",
"hgnc_gene_id": "EVA1C",
"overall": 1.942294708,
"geneticsscore": null,
"omicsscore": null
},
{
"ensg": "ENSG00000010818",
"hgnc_gene_id": "HIVEP2",
"overall": 3.185141534,
"geneticsscore": null,
"omicsscore": 1.441079289
},
{
"ensg": "ENSG00000166979",
"hgnc_gene_id": "EVA1C",
"overall": 1.942294708,
"geneticsscore": 0.850712097,
"omicsscore": null
},
{
"ensg": "ENSG00000166979",
"hgnc_gene_id": "EVA1C",
"overall": 1.942294708,
"geneticsscore": 0.850712097,
"omicsscore": 1.091582611
}
]
65 changes: 65 additions & 0 deletions tests/transform/test_overall_scores.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import os
import pandas as pd
import pytest

from agoradatatools.etl.transform import overall_scores


class TestTransformOverallScores:
    """Parametrized checks for ``overall_scores.transform_overall_scores``.

    Every passing case reads a CSV fixture from ``<data_files_path>/input``, runs the
    transform on it, and compares the result with the JSON fixture of the same case
    stored under ``<data_files_path>/output``.
    """

    # Relative path to the fixture directory; it is joined with "input"/"output" below.
    data_files_path = "tests/test_assets/overall_scores"

    # (input CSV, expected output JSON) pairs. pass_test_ids labels them in the same order.
    pass_test_data = [
        # Pass with good data
        ("test_overall_scores_good_input.csv", "overall_scores_good_output.json"),
        # Pass with score or isscored values missing
        ("test_overall_scores_missing_input.csv", "overall_scores_missing_output.json"),
    ]
    pass_test_ids = [
        "Pass with good data",
        "Pass with missing score or isscored values",
    ]

    # No failure cases for this transform; the empty lists are kept for the stub at the
    # bottom of the class.
    fail_test_data = []
    fail_test_ids = []

    @pytest.mark.parametrize(
        "input_file, expected_output_file", pass_test_data, ids=pass_test_ids
    )
    def test_transform_overall_scores_should_pass(
        self, input_file, expected_output_file
    ):
        """Run the transform on one input fixture and compare with its expected output."""
        input_path = os.path.join(self.data_files_path, "input", input_file)
        expected_path = os.path.join(
            self.data_files_path, "output", expected_output_file
        )

        # overall_scores data comes from a Synapse table, so its index is supplied by Synapse
        # rather than being 0..N. The fixture CSVs carry real index values from that table in
        # their first column; index_col=0 restores them so the input stays as close to the
        # real data as possible.
        raw_scores = pd.read_csv(input_path, index_col=0)

        # The expected JSON is loaded with a fresh 0..N index and assert_frame_equal() rejects
        # frames whose indices differ, so the transform output is re-indexed before comparing.
        # Even a 0..N input index would not help: dropping duplicates leaves gaps in it.
        actual = overall_scores.transform_overall_scores(df=raw_scores).reset_index(
            drop=True
        )

        expected = pd.read_json(expected_path)
        pd.testing.assert_frame_equal(actual, expected)

    """
    # Leaving code stub for failure case, in case we want to add this in the future
    @pytest.mark.parametrize("input_file", fail_test_data, ids=fail_test_ids)
    def test_transform_overall_scores_should_fail(self, input_file):
        with pytest.raises(<Error type>):
            scores_df = pd.read_csv(os.path.join(self.data_files_path, "input", input_file), index_col=0)
            overall_scores.transform_overall_scores(df=scores_df)
    """

0 comments on commit 769baae

Please sign in to comment.