Skip to content

Commit

Permalink
captures frequency information as frequency_qualifier, and disease_id…
Browse files Browse the repository at this point in the history
… information as qualifier. Disease ids are also mapped to mondo and associations are collapsed by uniq (gene_id, hp_id, disease_id)
  • Loading branch information
AO33 committed Sep 5, 2024
1 parent a49ce47 commit 97cd838
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 19 deletions.
3 changes: 3 additions & 0 deletions src/monarch_phenotype_profile_ingest/download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
- url: http://purl.obolibrary.org/obo/hp/hpoa/genes_to_phenotype.txt
local_name: data/genes_to_phenotype.txt

- url: https://data.monarchinitiative.org/mappings/latest/mondo.sssom.tsv
local_name: data/mondo.sssom.tsv


# For disease_to_phenotype
- url: http://purl.obolibrary.org/obo/hp/hpoa/phenotype.hpoa
Expand Down
103 changes: 86 additions & 17 deletions src/monarch_phenotype_profile_ingest/gene_to_phenotype_transform.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,105 @@
# General imports
import sys
import uuid # For generating UUIDs for associations
import subprocess # For counting number of lines in file so we know when to write out all information from koza
import pandas as pd

# Koza and biolink / pydantic imports
from koza.cli_utils import get_koza_app
from biolink_model.datamodel.pydanticmodel_v2 import (GeneToPhenotypicFeatureAssociation,
KnowledgeLevelEnum,
AgentTypeEnum)
from phenotype_ingest_utils import phenotype_frequency_to_hpo_term, Frequency


# Initiate koza app
def count_lines_in_file(file_path):
"""
Uses subprocess to grab the line count of a file and return it as an integer
"""

# The output is in the format: 'number_of_lines filename', so we split and take the first part
result = subprocess.run(['wc', '-l', file_path], stdout=subprocess.PIPE, text=True)
line_count = int(result.stdout.split()[0])
return line_count


def make_mondo_map(sssom_filepath, header=50):

# Read data into memory as df
df = pd.read_csv(sssom_filepath, sep='\t', header=header)

# Loop through relevant columns and create a map --> mondo
mondo_map = {}
for mid, pred, objid in zip(list(df["subject_id"]), list(df["predicate_id"]), list(df["object_id"])):

if pred != "skos:exactMatch":
continue

if objid not in mondo_map:
mondo_map.update({objid:mid})

# Note, currently only a single term does this mesh:D006344 --> MONDO:0006664, MONDO:0020437
elif mondo_map[objid] != mid:
continue
#print("- Same object id is found to map to multiple mondo ids... {}, {}, {}".format(objid, mondo_map[objid], mid))

return mondo_map


# Initiate koza app and mondo map
koza_app = get_koza_app("gene_to_phenotype")
mondo_map = make_mondo_map("./data/mondo.sssom.tsv")
line_count = count_lines_in_file(koza_app.source.config.files[0]) - 1 # Subtract 1 for header information

count = 0
g2p = {}
while (row := koza_app.get_row()) is not None:

gene_id = "NCBIGene:" + row["ncbi_gene_id"]
phenotype_id = row["hpo_id"]

association = GeneToPhenotypicFeatureAssociation(id="uuid:" + str(uuid.uuid1()),
subject=gene_id,
predicate="biolink:has_phenotype",
object=phenotype_id,
aggregator_knowledge_source=["infores:monarchinitiative"],
primary_knowledge_source="infores:hpo-annotations",
knowledge_level=KnowledgeLevelEnum.logical_entailment,
agent_type=AgentTypeEnum.automated_agent)

#print(association)
#count += 1
#if count == 10:
# sys.exit()
# break
# Nothing provided
if row["frequency"] == "-":
frequency = Frequency()
else:
# Raw frequencies - HPO term curies, ratios, percentages - normalized to HPO terms
frequency: Frequency = phenotype_frequency_to_hpo_term(row["frequency"])

# Convert to mondo id if possible, otherwise leave as is
org_id = row["disease_id"].replace("ORPHA:", "Orphanet:")
dis_id = org_id
if dis_id in mondo_map:
dis_id = mondo_map[dis_id]

# Create this association if it doesn't already exist and store so it can be updated (if necessary) later on in the file
key = (gene_id, phenotype_id, dis_id)
if key not in g2p:
quals = ';'.join(list(set([dis_id, org_id])))
association = GeneToPhenotypicFeatureAssociation(id="uuid:" + str(uuid.uuid1()),
subject=gene_id,
predicate="biolink:has_phenotype",
object=phenotype_id,
aggregator_knowledge_source=["infores:monarchinitiative"],
primary_knowledge_source="infores:hpo-annotations",
knowledge_level=KnowledgeLevelEnum.logical_entailment,
agent_type=AgentTypeEnum.automated_agent,
frequency_qualifier=frequency.frequency_qualifier if frequency.frequency_qualifier else None,
qualifier=quals)
g2p.update({key:association})

# Otherwise, update the frequency qualifier for the stored association
else:
quals = ';'.join(list(set([dis_id, org_id]+g2p[key].qualifier.split(';'))))
g2p[key].qualifier = quals
if g2p[key].frequency_qualifier == None:
g2p[key].frequency_qualifier = frequency.frequency_qualifier if frequency.frequency_qualifier else None

# Don't write out anything until we reach the very last line of the file
# This ensures that each association we make gets all of the relevant information pulled in (the proper frequency_qualifier in this case is what we are trying to capture along with the different disease ids)
count += 1
if count == line_count:
for k, assco in g2p.items():
koza_app.write(assco)

koza_app.write(association)
# Sanity check
if count > line_count:
print("- ERROR, Additional {} lines found... Most likely a discrepancy with the header line(s) expectations".format(count-line_count))
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ metadata: "src/monarch_phenotype_profile_ingest/metadata.yaml"
files:
- 'data/genes_to_phenotype.txt'

header: 'none'
header: 0
format: 'csv'
delimiter: '\t'

Expand All @@ -26,7 +26,8 @@ edge_properties:
- 'subject'
- 'predicate'
- 'object'
- 'qualifiers'
- 'frequency_qualifier'
- 'qualifier'
- 'aggregator_knowledge_source'
- 'primary_knowledge_source'
- 'knowledge_level'
Expand Down

0 comments on commit 97cd838

Please sign in to comment.