From ae432ac481bef8dafd3cce7d4dc8ac9302408065 Mon Sep 17 00:00:00 2001 From: ecwood Date: Sat, 19 Aug 2023 12:06:21 -0700 Subject: [PATCH] #316 improving HL7 to actually work and adding in HPO --- kg2_util.py | 1 + umls_list_jsonl_to_kg_jsonl.py | 24 ++++++++++++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/kg2_util.py b/kg2_util.py index d7750974..e5f3d50d 100644 --- a/kg2_util.py +++ b/kg2_util.py @@ -73,6 +73,7 @@ CURIE_PREFIX_HCPCS = 'HCPCS' CURIE_PREFIX_HGNC = 'HGNC' CURIE_PREFIX_HMDB = 'HMDB' +CURIE_PREFIX_HP = 'HP' CURIE_PREFIX_IAO = 'IAO' CURIE_PREFIX_IDENTIFIERS_ORG_REGISTRY = 'identifiers_org_registry' CURIE_PREFIX_ISBN = 'ISBN' diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 6a8a508d..b05eb1b5 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -39,6 +39,7 @@ HCPCS_PREFIX = kg2_util.CURIE_PREFIX_HCPCS HGNC_PREFIX = kg2_util.CURIE_PREFIX_HGNC HL7_PREFIX = kg2_util.CURIE_PREFIX_UMLS +HPO_PREFIX = kg2_util.CURIE_PREFIX_HP UMLS_SOURCE_PREFIX = kg2_util.CURIE_PREFIX_UMLS_SOURCE @@ -107,12 +108,12 @@ def get_basic_info(curie_prefix, node_id, info, accession_heirarchy): provided_by = make_node_id(UMLS_SOURCE_PREFIX, curie_prefix) cuis = info.get(CUIS_KEY, list()) tuis = info.get(TUIS_KEY, list()) - iri = IRI_MAPPINGS[curie_prefix] + node_id - if curie_prefix == kg2_util.UMLS_SOURCE_PREFIX: + if curie_prefix == kg2_util.CURIE_PREFIX_UMLS: if len(cuis) != 1: return None, None, None, None, None, None, None, None node_id = cuis[0] node_curie = make_node_id(curie_prefix, node_id) + iri = IRI_MAPPINGS[curie_prefix] + node_id category = TUI_MAPPINGS[str(tuple(tuis))] names = info.get(NAMES_KEY, dict()) @@ -267,6 +268,7 @@ def process_hl7_item(node_id, info, nodes_output, edges_output): provided_by = make_node_id(UMLS_SOURCE_PREFIX, 'HL7') # Currently not used, but extracting them in case we want them in the future - descriptions from https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/attribute_names.html + attributes = info.get(INFO_KEY, dict()) hl7at = attributes.get('HL7AT', list()) hl7ii = attributes.get('HL7II', list()) hl7im = attributes.get('HL7IM', list()) @@ -306,6 +308,21 @@ def process_hl7_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) +def process_hpo_item(node_id, info, nodes_output, edges_output): + accession_heirarchy = ['PT', 'SY', 'ET', 'OP', 'IS', 'OET'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(HPO_PREFIX, node_id, info, accession_heirarchy) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + sid = attributes.get('SID', list()) + hpo_comment = attributes.get('HPO_COMMENT', list()) + date_created = attributes.get('DATE_CREATED', list()) + syn_qualifier = attributes.get('SYN_QUALIFIER', list()) + ref = attributes.get('REF', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + if __name__ == '__main__': print("Starting umls_list_jsonl_to_kg_jsonl.py at", kg2_util.date()) args = get_args() @@ -368,6 +385,9 @@ def process_hl7_item(node_id, info, nodes_output, edges_output): process_hl7_item(node_id, value, nodes_output, edges_output) if source == 'HPO': + process_hpo_item(node_id, value, nodes_output, edges_output) + + if source == 'ICD10PCS': name_keys.add(get_name_keys(value.get(NAMES_KEY, dict()))) attribute_keys.update(get_attribute_keys(value.get(INFO_KEY, dict())))