From e1919881b5e262272479dcd09b2039560d8d6172 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Thu, 29 Aug 2024 12:50:41 -0700
Subject: [PATCH] #387 the recursive category picker is working

python3 ontologies_jsonl_to_kg_jsonl.py ontologies.json maps/curies-to-categories.yaml null > ontology_nodes.json
---
 ontologies_jsonl_to_kg_jsonl.py | 122 +++++++++++++++++++++++---------
 1 file changed, 88 insertions(+), 34 deletions(-)

diff --git a/ontologies_jsonl_to_kg_jsonl.py b/ontologies_jsonl_to_kg_jsonl.py
index 93e119db..34494e56 100644
--- a/ontologies_jsonl_to_kg_jsonl.py
+++ b/ontologies_jsonl_to_kg_jsonl.py
@@ -42,6 +42,13 @@
                  "oboInOwl:hasDbXref": TEXT_KEY,
                  "oboInOwl:xref": TEXT_KEY}
 
+CLASS_TO_SUPERCLASSES = dict()
+SAVED_NODE_INFO = dict()
+SOURCE_INFO = dict()
+
+NODE_CATEGORY_MAPPINGS = dict()
+PREFIX_MAPPINGS = dict()
+
 CLASSES_DICT = dict()
 
 URI_MAP = dict()
@@ -49,21 +56,57 @@
 
 MISSING_ID_PREFIXES = set()
 
+FILE_MAPPING = "file"
+PREFIX_MAPPING = "prefix"
+RECURSE_MAPPING = "recurse"
+
 def get_args():
     arg_parser = argparse.ArgumentParser()
     arg_parser.add_argument('--test', dest='test', action="store_true", default=False)
     arg_parser.add_argument('inputFile', type=str)
+    arg_parser.add_argument('curiesToCategoriesYAML', type=str)
     arg_parser.add_argument('outputFile', type=str)
     return arg_parser.parse_args()
 
+def categorize_node(node_id, recursion_depth=0):
+    node_prefix = node_id.split(':')[0]
+
+    if node_id in NODE_CATEGORY_MAPPINGS and NODE_CATEGORY_MAPPINGS[node_id][1] == FILE_MAPPING:
+        return NODE_CATEGORY_MAPPINGS[node_id][0]
+
+    if node_prefix in PREFIX_MAPPINGS:
+        node_category = PREFIX_MAPPINGS[node_prefix]
+        NODE_CATEGORY_MAPPINGS[node_id] = (node_category, PREFIX_MAPPING)
+        return PREFIX_MAPPINGS[node_prefix]
+
+    # Try to get the most common superclass categorization
+    superclass_categorizations = dict()
+    highest_value = 0
+    highest_category = kg2_util.BIOLINK_CATEGORY_NAMED_THING
+    if recursion_depth == 10:
+        return kg2_util.BIOLINK_CATEGORY_NAMED_THING
+
+    for superclass in CLASS_TO_SUPERCLASSES.get(node_id, list()):
+        superclass_category = categorize_node(superclass, recursion_depth + 1)
+        if superclass_category not in superclass_categorizations:
+            superclass_categorizations[superclass_category] = 0
+        superclass_categorizations[superclass_category] += 1
+        if superclass_categorizations[superclass_category] > highest_value:
+            highest_value = superclass_categorizations[superclass_category]
+            highest_category = superclass_category
+
+    NODE_CATEGORY_MAPPINGS[node_id] = (highest_category, RECURSE_MAPPING)
+    return highest_category
+
+
 def process_ontology_item(ontology_item):
     source = ontology_item.get(OWL_SOURCE_KEY, str())
     for owl_class in ontology_item.get(OWL_CLASS_TAG, list()):
         # Typically genid classes which don't neatly map onto the KG2 schema
         if ID_TAG not in owl_class:
             continue
-        # TODO: MAP THIS HERE, since not all sources use same IRIs for the same nodes
         node_id = match_prefix(owl_class.get(ID_TAG, str()))
         if node_id is None:
             continue
@@ -123,6 +166,7 @@ def process_ontology_item(ontology_item):
                 if RESOURCE_KEY in edge:
                     edges_list.append((general_edge_type, edge.get(RESOURCE_KEY, None)))
 
+        superclasses = set()
         final_edges_list = list()
         for (edge_relation, edge_object) in edges_list:
             edge_object = match_prefix(edge_object)
@@ -131,37 +175,38 @@ def process_ontology_item(ontology_item):
             edge_relation = match_prefix(edge_relation)
             if edge_relation is None:
                 continue
+            if edge_relation in ["rdfs:subClassOf"]:
+                superclasses.add(edge_object)
             final_edges_list.append((edge_relation, edge_object))
 
+        # Imperfect way to make it deterministic
+        superclasses = sorted(list(superclasses))
+        if node_id not in CLASS_TO_SUPERCLASSES:
+            CLASS_TO_SUPERCLASSES[node_id] = list()
+        CLASS_TO_SUPERCLASSES[node_id] += superclasses
+        CLASS_TO_SUPERCLASSES[node_id] = sorted(list(set(CLASS_TO_SUPERCLASSES[node_id])))
+
+        if node_id not in SAVED_NODE_INFO:
+            SAVED_NODE_INFO[node_id] = list()
+        SAVED_NODE_INFO[node_id].append({"id": node_id, "description_list": description_list, "name": name_list, "source": source, "has_biological_sequence": has_biological_sequence, "edges": final_edges_list})
+
+    for ontology_node in ontology_item.get("owl:Ontology", list()):
+        ontology_version = None
+        ontology_versions = [version.get(TEXT_KEY, str()) for version in ontology_node.get("owl:versionInfo", list()) if TEXT_KEY in version]
+        ontology_version_iri = [version.get(RESOURCE_KEY, str()) for version in ontology_node.get("owl:versionIRI", list()) if RESOURCE_KEY in version]
+        ontology_date = [version.get(TEXT_KEY, str()) for date_type in ["oboInOwl:date", "dcterms:date", "dc:date"] for version in ontology_node.get(date_type, list()) if TEXT_KEY in version]
+        if len(ontology_versions) == 1:
+            ontology_version = ontology_versions[0]
+        elif len(ontology_version_iri) == 1:
+            ontology_version = ontology_version_iri[0]
+        elif len(ontology_date) == 1:
+            ontology_version = ontology_date[0]
+
+        if ontology_version is None:
+            print("Warning: source", source, "lacks any versioning information.")
+        if source not in SOURCE_INFO:
+            SOURCE_INFO[source] = {"source": source, "ontology_date": ontology_date, "ontology_version": ontology_version}
 
-        # node_id = owl_class.get(ID_TAG, list())
-
-        # superclasses = [superclass.get(RESOURCE_KEY, str()) for superclass in owl_class.get(SUBCLASS_TAG, list())]
-
-        # # Also query for comments?
-        # # Descriptions appear to be additive in current KG2
-        # descriptions = owl_class.get(DESCRIPTION_TAG, list())
-        # assert len(descriptions) <= 1
-        # description = str()
-        # for element in descriptions:
-        #     description += element[TEXT_KEY]
-
-        # xrefs = [xref[TEXT_KEY] for xref in owl_class.get(XREF_TAG, list())]
-        # for element in owl_class.get(XREF_TAG, list()):
-        #     xrefs.append(element[TEXT_KEY])
-
-        # exact_matches = [exact_match[RESOURCE_KEY] for exact_match in owl_class.get(EXACT_MATCH_TAG, list())]
-
-        # names = owl_class.get(NAME_TAG, list())
-        # assert len(names) <= 1, ontology_item
-        # name = str()
-        # for element in names:
-        #     name += element[TEXT_KEY]
-
-        # node = {"id": node_id, "superclasses": superclasses, "description": description, "xrefs": xrefs, "name": name, "exact_matches": exact_matches}
-
-        node = {"id": node_id, "description_list": description_list, "name": name_list, "source": source, "has_biological_sequence": has_biological_sequence, "edges": final_edges_list}
-        print(json.dumps(node, indent=4))
 
 def generate_uri_map():
     uri_input_map = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string("maps/curies-to-urls-map.yaml"))
@@ -201,8 +246,15 @@ def match_prefix(node_id):
 if __name__ == '__main__':
     args = get_args()
     input_file_name = args.inputFile
+    curies_to_categories_file_name = args.curiesToCategoriesYAML
     output_file_name = args.outputFile
 
+    curies_to_categories_data = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_categories_file_name))
+    for mapping_node in curies_to_categories_data["term-mappings"]:
+        NODE_CATEGORY_MAPPINGS[mapping_node] = (curies_to_categories_data["term-mappings"][mapping_node], FILE_MAPPING)
+    for prefix in curies_to_categories_data["prefix-mappings"]:
+        PREFIX_MAPPINGS[prefix] = curies_to_categories_data["prefix-mappings"][prefix]
+
     input_read_jsonlines_info = kg2_util.start_read_jsonlines(input_file_name)
     input_data = input_read_jsonlines_info[0]
 
@@ -211,9 +263,11 @@ def match_prefix(node_id):
     generate_uri_map()
     for ontology_item in input_data:
         process_ontology_item(ontology_item)
-    print(json.dumps(sorted(list(MISSING_ID_PREFIXES)), indent=4))
-    # print("OWL Classes:", owl_class_count)
-    # for key in KEYS_DICT:
-    #     KEYS_DICT[key] = KEYS_DICT[key] / owl_class_count
-    # print(json.dumps(KEYS_DICT, indent=4, sort_keys=True))
\ No newline at end of file
+    for node_id in SAVED_NODE_INFO:
+        categorize_node(node_id)
+
+    print(json.dumps(NODE_CATEGORY_MAPPINGS, indent=4))
+
+    # Can add this back in later
+    # print(json.dumps(sorted(list(MISSING_ID_PREFIXES)), indent=4))
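
Usage note: the new positional argument points at maps/curies-to-categories.yaml, which the
__main__ block above reads for top-level "term-mappings" (exact CURIE -> category) and
"prefix-mappings" (CURIE prefix -> category) dictionaries. The sketch below is illustrative
only: the CURIEs and categories are invented, and it assumes the globals and the
categorize_node function defined in this patch are in scope.

    # Hypothetical parsed form of maps/curies-to-categories.yaml (invented entries):
    curies_to_categories_data = {
        "term-mappings":   {"EX:0000001": "biolink:Disease"},
        "prefix-mappings": {"CHEBI": "biolink:ChemicalEntity"},
    }

    # Same loading loops as the __main__ block in the patch:
    for mapping_node in curies_to_categories_data["term-mappings"]:
        NODE_CATEGORY_MAPPINGS[mapping_node] = (curies_to_categories_data["term-mappings"][mapping_node], FILE_MAPPING)
    for prefix in curies_to_categories_data["prefix-mappings"]:
        PREFIX_MAPPINGS[prefix] = curies_to_categories_data["prefix-mappings"][prefix]

    categorize_node("EX:0000001")     # -> "biolink:Disease" (exact term mapping wins)
    categorize_node("CHEBI:12345")    # -> "biolink:ChemicalEntity" (prefix mapping)
    # Any other node falls back to the most common category among its recorded
    # superclasses, bottoming out at kg2_util.BIOLINK_CATEGORY_NAMED_THING after
    # ten levels of recursion.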