diff --git a/code/UI/autocomplete/GenerateQuestionTerms.py b/code/UI/autocomplete/GenerateQuestionTerms.py deleted file mode 100644 index 0b00e458f..000000000 --- a/code/UI/autocomplete/GenerateQuestionTerms.py +++ /dev/null @@ -1,84 +0,0 @@ -import sys, os -import json - -question_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), - 'reasoningtool/QuestionAnswering') -sys.path.append(question_dir) -from Question import Question - -neo4j_helper_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), - 'reasoningtool/kg-construction') -sys.path.append(neo4j_helper_dir) -from Neo4jConnection import Neo4jConnection - -sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../../") # code directory -from RTXConfiguration import RTXConfiguration - - -class GenerateQuestionTerms: - @staticmethod - def __get_question_templates(): - question_templates = [] - with open(os.path.join(question_dir, 'Questions.tsv'), 'r') as fid: - for line in fid.readlines(): - if line[0] == "#": - pass - else: - question = Question(line) - question_templates.append(question) - return question_templates - - @staticmethod - def __get_node_names(type): - # # connect to Neo4j - # f = open(os.path.join(neo4j_helper_dir, 'config.json'), 'r') - # config_data = f.read() - # f.close() - # config = json.loads(config_data) - - # create the RTXConfiguration object - rtxConfig = RTXConfiguration() - - kg2_neo4j_info = rtxConfig.get_neo4j_info("KG2pre") - - conn = Neo4jConnection(kg2_neo4j_info['bolt'], - kg2_neo4j_info['username'], - kg2_neo4j_info['password']) - names = conn.get_node_names(type) - conn.close() - - return names - - @staticmethod - def generateQuetionsToTXT(): - question_templates = GenerateQuestionTerms.__get_question_templates() - - have_writen = False - for i, question in enumerate(question_templates): - - # retrieve the type and template from question_template - if 
len(question.parameter_names) == 0: - continue - type = (question.parameter_names)[0] - question_template = question.restated_question_template - - names = GenerateQuestionTerms.__get_node_names(type) - - if len(names) != 0: - question_content = '' - for name in names: - question_phase = question_template.safe_substitute({type: name}) - question_content = question_content + question_phase + '\n' - - # write content to file - if have_writen: - with open('question_terms.txt', 'a') as w_f: - w_f.write(question_content) - else: - with open('question_terms.txt', 'w') as w_f: - w_f.write(question_content) - have_writen = True - - -if __name__ == '__main__': - GenerateQuestionTerms.generateQuetionsToTXT() diff --git a/code/reasoningtool/sharedtrunk.py b/code/archive/sharedtrunk.py similarity index 100% rename from code/reasoningtool/sharedtrunk.py rename to code/archive/sharedtrunk.py diff --git a/code/reasoningtool/CmdPubMedNGD.py b/code/reasoningtool/CmdPubMedNGD.py deleted file mode 100644 index 65b132550..000000000 --- a/code/reasoningtool/CmdPubMedNGD.py +++ /dev/null @@ -1,45 +0,0 @@ -'''Returns the Normalized Google semantic distance between two string MeSH terms - - Usage: python3 CmdPubMedNGD.py term1 term2 - - Example: python3 CmdPubMedNGD.py atherosclerosis hypercholesterolemia - - Output: JSON dump of dict (keys are "value" and "status") -''' -__author__ = 'Stephen Ramsey' -__copyright__ = 'Oregon State University' -__credits__ = ['Stephen Ramsey'] -__license__ = 'MIT' -__version__ = '0.1.0' -__maintainer__ = '' -__email__ = '' -__status__ = 'Prototype' - -import argparse -import math -import json -from QueryNCBIeUtils import QueryNCBIeUtils - -def main(): - parser = argparse.ArgumentParser(description="Returns the Normalized Google semantic distance between two string MeSH terms", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('terms', metavar='terms', type=str, nargs=2, help='two string arguments; must be MeSH terms') - args 
= parser.parse_args() - mesh_terms = args.terms - ngd = QueryNCBIeUtils.normalized_google_distance(*mesh_terms) - res_dict = dict() - res_dict['value'] = ngd - - if math.isnan(ngd): - res_dict['status'] = 'unsuccessful' - if not QueryNCBIeUtils.is_mesh_term(mesh_terms[0]): - res_dict['status'] += '; Term 1 is not a valid MeSH term' - if not QueryNCBIeUtils.is_mesh_term(mesh_terms[1]): - res_dict['status'] += '; Term 2 is not a valid MeSH term' - else: - res_dict['status'] = 'success' - print(json.dumps(res_dict)) - -if __name__ == "__main__": - main() - diff --git a/code/reasoningtool/MLDrugRepurposing/MyChemGT.py b/code/reasoningtool/MLDrugRepurposing/MyChemGT.py deleted file mode 100644 index d722ece84..000000000 --- a/code/reasoningtool/MLDrugRepurposing/MyChemGT.py +++ /dev/null @@ -1,111 +0,0 @@ -import sys -import os -new_path = os.path.join(os.getcwd(), '..', 'kg-construction') -sys.path.insert(0, new_path) - -new_path2 = os.path.join(os.getcwd(), '..', 'SemMedDB') -sys.path.insert(0, new_path2) - -from SynonymMapper import SynonymMapper -from QueryMyChem import QueryMyChem -from DrugMapper import DrugMapper -from QueryUMLSApi import QueryUMLS -import requests -import pandas -import time -import requests_cache -import numpy -import urllib -import ast - -requests_cache.install_cache('MyChemCache') - -df = pandas.read_csv('data/drugs.csv') -df = df.loc[df["id"].str.upper().str.startswith('CHEMBL', na=False)].reset_index(drop=True) - -def map_drug_to_ontology(chembl_id): - """ - mapping between a drug and Disease Ontology IDs and/or Human Phenotype Ontology IDs corresponding to indications - - :param chembl_id: The CHEMBL ID for a drug - - :return: A dictionary with two fields ('indication' and 'contraindication'). 
Each field is a set of strings - containing the found hp / omim / doid ids or empty set if none were found - """ - indication_onto_set = set() - contraindication_onto_set = set() - if not isinstance(chembl_id, str): - return {'indications': indication_onto_set, "contraindications": contraindication_onto_set} - drug_use = QueryMyChem.get_drug_use(chembl_id) - indications = drug_use['indications'] - contraindications = drug_use['contraindications'] - sm = SynonymMapper() - for indication in indications: - if 'snomed_concept_id' in indication.keys(): - oxo_results = sm.get_all_from_oxo('SNOMEDCT:' + str(indication['snomed_concept_id']), ['DOID', 'OMIM', 'HP']) - if oxo_results is not None: - for oxo_result in oxo_results: - indication_onto_set.add(oxo_result) - else: - oxo_results = sm.get_all_from_oxo('SNOMEDCT:' + str(indication['snomed_concept_id']), ['UMLS']) - if oxo_results is not None: - for oxo_result in oxo_results: - indication_onto_set.add(oxo_result) - for contraindication in contraindications: - if 'snomed_concept_id' in contraindication.keys(): - oxo_results = sm.get_all_from_oxo('SNOMEDCT:' + str(contraindication['snomed_concept_id']), ['DOID', 'OMIM', 'HP']) - if oxo_results is not None: - for oxo_result in oxo_results: - contraindication_onto_set.add(oxo_result) - else: - oxo_results = sm.get_all_from_oxo('SNOMEDCT:' + str(contraindication['snomed_concept_id']), ['UMLS']) - if oxo_results is not None: - for oxo_result in oxo_results: - contraindication_onto_set.add(oxo_result) - return {'indications': indication_onto_set, "contraindications": contraindication_onto_set} - -# Initialized the lists used to create the dataframes -mychem_tp_list = [] -mychem_tn_list = [] -# UMLS targets will be seperated to be converted into DOID, HP, or OMIM -umls_tn_list = [] -umls_tp_list = [] - -d = 0 -for drug in df['id']: - chembl_id = drug.split(':')[1] - if not chembl_id.startswith('CHEMBL'): - chembl_id = 'CHEMBL' + chembl_id - elif 
chembl_id.startswith('CHEMBL.COMPOUND'): - curie_id = curie_id.split(':')[1] - res = map_drug_to_ontology(chembl_id) - # Load indications and contraintications into their respective lists - for ind in res['indications']: - if ind.startswith('UMLS:'): - umls_tp_list += [[drug,ind.split(':')[1]]] - else: - mychem_tp_list += [[drug,ind]] - for cont in res['contraindications']: - if cont.startswith('UMLS:'): - umls_tn_list += [[drug,cont.split(':')[1]]] - else: - mychem_tn_list += [[drug,cont]] - d += 1 - # This prints percentage progress every 10%. Uncomment if you want this. - #if d % int(len(df)/10 + 1) == 0: - # print(d/len(df)) - -# Convert lists to dataframes -tp_df = pandas.DataFrame(mychem_tp_list,columns = ['source','target']) -tn_df = pandas.DataFrame(mychem_tn_list,columns = ['source','target']) -umls_tp_df = pandas.DataFrame(umls_tp_list,columns = ['source','target']) -umls_tn_df = pandas.DataFrame(umls_tn_list,columns = ['source','target']) - -# Save dataframes as csvs -tp_df.to_csv("data/mychem_tp.csv",index=False) -tn_df.to_csv("data/mychem_tn.csv",index=False) -umls_tp_df.to_csv("data/mychem_tp_umls.csv",index=False) -umls_tn_df.to_csv("data/mychem_tn_umls.csv",index=False) - - - diff --git a/code/reasoningtool/MLDrugRepurposing/README.md b/code/reasoningtool/MLDrugRepurposing/README.md index 557d5d58e..95a4e8e94 100644 --- a/code/reasoningtool/MLDrugRepurposing/README.md +++ b/code/reasoningtool/MLDrugRepurposing/README.md @@ -1,10 +1,15 @@ # NOTE: dependencies have been removed -As of May 15, 2025, one of the modules on which the code in this directory +On May 15, 2025, one of the modules on which some code in this directory depends, `RTX/code/reasoningtool/SemMedDB`, was deleted from the `RTXteam/RTX` project area (see #2454). But if you need this code, you can obtain it from any -RTXteam/RTX [release](https://github.com/RTXteam/RTX/releases) (the code in the -SemMedDB project directory hasn't changed since 2019). 
+earlier RTXteam/RTX [release](https://github.com/RTXteam/RTX/releases). + +On Oct. 9, 2025, a module, +`RTX/code/reasoningtool/kg-construction/SynonymMapper.py`, on which the +`MyChemGT.py` module in this directory depends, was deleted from the +`RTXteam/RTX` project area (see #2582). But if you need this code, you can +obtain it from any earlier RTXteam/RTX [release](https://github.com/RTXteam/RTX/releases). # Make sure python is set up correctly diff --git a/code/reasoningtool/Neo4jToNetworkX.py b/code/reasoningtool/Neo4jToNetworkX.py deleted file mode 100644 index c2fa9c8a1..000000000 --- a/code/reasoningtool/Neo4jToNetworkX.py +++ /dev/null @@ -1,63 +0,0 @@ -import networkx as nx -import cypher -from collections import namedtuple - -import sys, os - -# Get rtxConfig -sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/../") # code directory -from RTXConfiguration import RTXConfiguration -rtxConfig = RTXConfiguration() - -# Connection information for the ipython-cypher package -connection = "http://" + rtxConfig.neo4j_username + ":" + rtxConfig.neo4j_password + "@" + rtxConfig.neo4j_database -DEFAULT_CONFIGURABLE = { - "auto_limit": 0, - "style": 'DEFAULT', - "short_errors": True, - "data_contents": True, - "display_limit": 0, - "auto_pandas": False, - "auto_html": False, - "auto_networkx": False, - "rest": False, - "feedback": False, # turn off verbosity in ipython-cypher - "uri": connection, -} -DefaultConfigurable = namedtuple( - "DefaultConfigurable", - ", ".join([k for k in DEFAULT_CONFIGURABLE.keys()]) -) -config = DefaultConfigurable(**DEFAULT_CONFIGURABLE) - -# Convert neo4j subgraph (from cypher query) into a networkx graph -def get_graph(res, directed=True): - """ - This function takes the result (subgraph) of a ipython-cypher query and builds a networkx graph from it - :param res: output from an ipython-cypher query - :param directed: Flag indicating if the resulting graph should be treated as directed or not - :return: networkx graph 
(MultiDiGraph or MultiGraph) - """ - if nx is None: - raise ImportError("Try installing NetworkX first.") - if directed: - graph = nx.MultiDiGraph() - else: - graph = nx.MultiGraph() - for item in res._results.graph: - for node in item['nodes']: - graph.add_node(node['id'], properties=node['properties'], labels=node['labels'], names=node['properties']['name'], description=node['properties']['description']) - for rel in item['relationships']: - graph.add_edge(rel['startNode'], rel['endNode'], id=rel['id'], properties=rel['properties'], type=rel['type']) - return graph - -def test_get_graph(): - query = "MATCH path=allShortestPaths((s:omim_disease)-[*1..%d]-(t:disont_disease)) " \ - "WHERE s.name='%s' AND t.name='%s' " \ - "RETURN path" % (4, 'OMIM:137920', 'DOID:11476') - res = cypher.run(query, conn=connection, config=config) - graph = get_graph(res, directed=True) - if type(graph) is not nx.classes.MultiDiGraph: - raise(Exception("A networkx graph was not returned")) - if graph.number_of_nodes() < 1: - raise(Exception("An empty graph was returned")) diff --git a/code/reasoningtool/QuestionAnswering/LilGimTestQuestion.py b/code/reasoningtool/QuestionAnswering/LilGimTestQuestion.py deleted file mode 100644 index f9f0465bf..000000000 --- a/code/reasoningtool/QuestionAnswering/LilGimTestQuestion.py +++ /dev/null @@ -1,176 +0,0 @@ -# This script will return X that are similar to Y based on high Jaccard index of common one-hop nodes Z (X<->Z<->Y) - -import os -import sys -import argparse -# PyCharm doesn't play well with relative imports + python console + terminal -try: - from code.reasoningtool import ReasoningUtilities as RU -except ImportError: - sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - import ReasoningUtilities as RU - -import FormatOutput -import networkx as nx -import QueryLilGIM -import CustomExceptions -import ast - -class LilGim: - - def __init__(self): - None - - @staticmethod - def answer(tissue_id, input_protein_list, 
use_json=False, num_show=20, rev=True): - - # Initialize the response class - response = FormatOutput.FormatResponse(6) - - # Make sure everything exists in the graph - if not RU.node_exists_with_property(tissue_id, "id"): - tissue_id = RU.get_node_property(tissue_id, "id", node_label="anatomical_entity") - - for i in range(len(input_protein_list)): - id = input_protein_list[i] - if not RU.node_exists_with_property(id, "id"): - input_protein_list[i] = RU.get_node_property(id, "id", node_label="protein") - - # Initialize the QueryLilGim class - q = QueryLilGIM.QueryLilGIM() - - # get the description - tissue_description = RU.get_node_property(tissue_id, 'name', node_label="anatomical_entity") - - # Get the correlated proteins - try: - correlated_proteins_dict = q.query_neighbor_genes_for_gene_set_in_a_given_anatomy(tissue_id, tuple(input_protein_list)) - #correlated_proteins_dict = {'UniProtKB:Q99618': 0.4276333333333333, 'UniProtKB:Q92698': 0.464, 'UniProtKB:P56282': 0.5810000000000001, 'UniProtKB:P49454': 0.4441, 'UniProtKB:P49642': 0.5188333333333334, 'UniProtKB:Q9BZD4': 0.5042666666666668, 'UniProtKB:P38398': 0.4464, 'UniProtKB:Q9BXL8': 0.5009, 'UniProtKB:P42166': 0.4263000000000001, 'UniProtKB:Q96CS2': 0.5844333333333332, 'UniProtKB:Q9BQP7': 0.4903333333333333, 'UniProtKB:O95997': 0.4743333333333333, 'UniProtKB:Q9H4K1': 0.4709, 'UniProtKB:Q9H967': 0.5646666666666667, 'UniProtKB:Q12834': 0.4478, 'UniProtKB:Q71F23': 0.4361, 'UniProtKB:Q9UQ84': 0.4800666666666666, 'UniProtKB:Q9NSP4': 0.4347} - except: - error_message = "Lil'GIM is experiencing a problem." 
- error_code = "LilGIMerror" - response.add_error_message(error_code, error_message) - response.print() - return 1 - - # as a list of tuples - correlated_proteins_tupes = [] - for k, v in correlated_proteins_dict.items(): - correlated_proteins_tupes.append((k, v)) - - # sort by freq - correlated_proteins_tupes_sorted = sorted(correlated_proteins_tupes, key=lambda x: x[1], reverse=rev) - correlated_proteins_tupes_sorted = correlated_proteins_tupes_sorted[0:num_show] - correlated_proteins_tupes = correlated_proteins_tupes_sorted - - - # return the results - if not use_json: - try: - protein_descriptions = RU.get_node_property(input_protein_list[0], "name", node_label="protein", name_type="id") - except: - protein_descriptions = input_protein_list[0] - for id in input_protein_list[1:-1]: - protein_descriptions += ", " - try: - protein_descriptions += RU.get_node_property(id, "name", node_label="protein", name_type="id") - except: - protein_descriptions += id - if len(input_protein_list) > 1: - try: - protein_descriptions += ", and %s" % RU.get_node_property(input_protein_list[-1], "name", node_label="protein", name_type="id") - except: - protein_descriptions += ", and %s" % input_protein_list[-1] - if rev: - to_print = "In the tissue: %s, the proteins that correlate most with %s" % (tissue_description, protein_descriptions) - else: - to_print = "In the tissue: %s, the proteins that correlate least with %s" % (tissue_description, protein_descriptions) - to_print += " according to Lil'GIM, are:\n" - for id, val in correlated_proteins_tupes_sorted: - try: - to_print += "protein: %s\t correlation %f\n" % (RU.get_node_property(id, "name", node_label="protein", name_type="id"), val) - except: - to_print += "protein: %s\t correlation %f\n" % (id, val) - print(to_print) - else: - # otherwise, you want a JSON output - protein_descriptions = [] - is_in_KG_list = [] - for protein, corr in correlated_proteins_tupes: - try: - description = RU.get_node_property(protein, "name", 
node_label="protein", name_type="id") - protein_descriptions.append(description) - is_in_KG_list.append(True) - except: - protein_description = protein - protein_descriptions.append(protein_description) - is_in_KG_list.append(False) - - # just get the ones that are actually in the KG. TODO: do something with the ones that are not in the KG - correlated_proteins_tupes_in_KG = [] - for i in range(len(correlated_proteins_tupes)): - if is_in_KG_list[i]: - correlated_proteins_tupes_in_KG.append(correlated_proteins_tupes[i]) - - # Return the results - full_g = RU.get_graph_from_nodes([id for id, val in correlated_proteins_tupes_in_KG], node_property_label="id") - id2node = dict() - for nx_id, node in full_g.nodes(data=True): - id2node[node['properties']['id']] = node - for id, corr in correlated_proteins_tupes_in_KG: - to_print = "In the tissue: %s, the protein %s has correlation %f with the given list of proteins." %(tissue_description, RU.get_node_property(id, "name", node_label="protein", name_type="id"), corr) - response.add_subgraph([(id, id2node[id])], [], to_print, corr) - response.print() - - @staticmethod - def describe(): - output = "Answers questions of the form: 'What proteins correlate with [$protein1, $protein2,...,$proteinK?] in blood?'" + "\n" - # TODO: subsample disease nodes - return output - - -def main(): - parser = argparse.ArgumentParser(description="Answers questions of the form: 'What proteins correlate with [$protein1, $protein2,...,$proteinK?] 
in blood?'", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('-t', '--tissue', type=str, help="Tissue id/name", default="UBERON:0002384") - parser.add_argument('-r', '--reverse', action='store_true', help="Include flag if you want the least correlations.") - parser.add_argument('-p', '--proteins', type=str, help="List of proteins.", default="[\'UniProtKB:P12004\']") - parser.add_argument('-j', '--json', action='store_true', help='Flag specifying that results should be printed in JSON format (to stdout)', default=False) - parser.add_argument('--describe', action='store_true', help='Print a description of the question to stdout and quit', default=False) - parser.add_argument('--num_show', type=int, help='Maximum number of results to return', default=20) - - # Parse and check args - args = parser.parse_args() - tissue_id = args.tissue - is_reverse = args.reverse - proteins = args.proteins - use_json = args.json - describe_flag = args.describe - num_show = args.num_show - - # Convert the string to an actual list - #print(proteins) - proteins_preserved = proteins - try: - proteins = proteins.replace(",", "','").replace("[", "['").replace("]", "']") - protein_list = ast.literal_eval(proteins) - protein_list_strip = [] - for protein in protein_list: - protein_list_strip.append(protein.strip()) - - protein_list = protein_list_strip - - except: - protein_list = eval(proteins_preserved) - - # Initialize the question class - Q = LilGim() - - if describe_flag: - res = Q.describe() - print(res) - else: - Q.answer(tissue_id, protein_list, use_json=use_json, num_show=num_show, rev=not(is_reverse)) - -if __name__ == "__main__": - main() diff --git a/code/reasoningtool/QuestionAnswering/QueryLilGIM.py b/code/reasoningtool/QuestionAnswering/QueryLilGIM.py deleted file mode 100644 index 67addd85c..000000000 --- a/code/reasoningtool/QuestionAnswering/QueryLilGIM.py +++ /dev/null @@ -1,181 +0,0 @@ -""" This module defines the module QueryLilGIM. 
QueryLilGIM provides -a method for finding neighboring genes (in a distance space defined -by correlation similarity) for a set of query genes, based on gene -expression data that are stored in a Google BigQuery table. The search -for neighboring genes is based on correlation measurements computed -in a specific anatomical context (specified by the user of this module). - -Based on an example Jupyter notebook provided here: -https://github.com/NCATS-Tangerine/cq-notebooks/blob/master/BigGIM/lilGIM%20and%20BigCLAM%20Examples.ipynb -""" - -__author__ = 'Stephen Ramsey' -__copyright__ = 'Oregon State University' -__credits__ = ['Stephen Ramsey', 'Theo Knijnenburg', 'John Earls', 'David Palzer'] -__license__ = 'MIT' -__version__ = '0.1.0' -__maintainer__ = '' -__email__ = '' -__status__ = 'Prototype' - -import urllib.request -import urllib.parse -# NOTE: this module *WILL NOT WORK* if you use requests package if caching via requests-cache is turned on -import json -import pandas -import time -import sys -import os -import functools -import ast - -from ReasoningUtilities import get_nodes_that_match_in_list - -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../kg-construction'))) # Go up one level and look for it - -from QueryEBIOLS import QueryEBIOLS -from QueryMyGene import QueryMyGene -import CachedMethods - -class QueryLilGIM: - BASE_URL = "http://biggim.ncats.io/api" - ENDPOINT = "lilgim/query" - DEFAULT_LIMIT = 100 - - def __init__(self, limit=DEFAULT_LIMIT): - self.limit = limit - self.mg = QueryMyGene() - - @staticmethod - def _get(endpoint, data={}, base_url=BASE_URL): - post_params = urllib.parse.urlencode(data) - url = '%s/%s?%s' % (base_url, endpoint, post_params) - req = urllib.request.urlopen(urllib.request.Request(url, headers={'Accept': 'application/json'})) -# print("Sent: GET %s?%s" % (req.request.url, req.request.body)) - return json.loads(req.read().decode()) - - @staticmethod - def _jprint(dct): - print(json.dumps(dct, 
indent=2)) - - @staticmethod - def _wrapper(endpoint, data={}, base_url=BASE_URL): - try: - response = QueryLilGIM._get(endpoint, data, base_url) -# QueryLilGIM._jprint(response) - except BaseException as e: - print(e, file=sys.stderr) - if e.response.status_code == 400: - QueryLilGIM._jprint(e.response.json(), file=sys.stderr) - raise - try: - ctr = 1 - while True: - query_status = QueryLilGIM._get('%s/status/%s' % (endpoint.split('/')[0], - response['request_id'],)) -# QueryLilGIM._jprint(query_status) - if query_status['status'] != 'running': - # query has finished - break - else: - time.sleep(ctr) - ctr += 1 - # linear backoff - except BaseException as e: - print(e, file=sys.stderr) - if e.response.status_code == 400: - QueryLilGIM._jprint(e.response.json(), file=sys.stderr) - raise - return pandas.concat(map(pandas.read_csv, query_status['request_uri'])) - - # anatomy_curie_id_str: string CURIE ID for an Uberon anatomy term - # protein_set_curie_id_str: a tuple containing one or more string CURIE IDs for proteins (UniProtKB) - # return value: a dict in which keys are string Uniprot CURIE IDs and values are correlation coeffs - @CachedMethods.register - @functools.lru_cache(maxsize=1024, typed=False) - def query_neighbor_genes_for_gene_set_in_a_given_anatomy(self, - anatomy_curie_id_str, - protein_set_curie_id_str): - - assert type(protein_set_curie_id_str) == tuple - assert len(protein_set_curie_id_str) > 0 - assert type(anatomy_curie_id_str) == str - - # convert UBERON anatomy curie ID str to a brenda anatomy ID - assert anatomy_curie_id_str.startswith("UBERON:") - bto_id_set = QueryEBIOLS.get_bto_id_for_uberon_id(anatomy_curie_id_str) - ret_dict = dict() - if len(bto_id_set) == 0: - return ret_dict - - assert len(bto_id_set) == 1 - - bto_term = QueryEBIOLS.get_bto_term_for_bto_id(next(iter(bto_id_set))).replace(" ", "_") - - entrez_gene_ids = set() - entrez_gene_ids_int = set() - - # convert uniprot IDs to Entrez gene IDs - for protein_curie_id_str in 
protein_set_curie_id_str: - assert protein_curie_id_str.startswith("UniProtKB:") - uniprot_acc = protein_curie_id_str.split(":")[1] - entrez_gene_id_set = self.mg.convert_uniprot_id_to_entrez_gene_ID(uniprot_acc) - for entrez_gene_id in entrez_gene_id_set: - entrez_gene_ids_int.add(entrez_gene_id) - entrez_gene_ids.add(str(entrez_gene_id)) - - entrez_gene_ids_str = ",".join(entrez_gene_ids) - - data = {"ids": entrez_gene_ids_str, - "tissue": bto_term, - "limit": self.limit} - - results = self._wrapper(self.ENDPOINT, data) - - ret_dict = dict() - gene_dict = dict() - - for index, row in results.iterrows(): - gene1 = row["Gene1"] - gene2 = row["Gene2"] - avg_corr = row["aveCorr"] - assert type(gene1) == int - assert type(gene2) == int - assert type(avg_corr) == float - if gene1 in entrez_gene_ids_int: - if gene2 in entrez_gene_ids_int: - # do nothing since this is not a new gene - new_gene_id = None - else: - # gene2 is the new gene - new_gene_id = gene2 - else: - if gene2 in entrez_gene_ids_int: - new_gene_id = gene1 - else: - print("neither gene was in the set of query genes, this should not happen", file=sys.stderr) - assert False - if new_gene_id is not None: - gene_dict[new_gene_id] = avg_corr - - for gene_id, avg_corr in gene_dict.items(): - uniprot_id_set = self.mg.convert_entrez_gene_id_to_uniprot_id(gene_id) - if len(uniprot_id_set) > 0: - for uniprot_id in uniprot_id_set: - ret_dict["UniProtKB:" + uniprot_id] = avg_corr - - query_res = get_nodes_that_match_in_list(ret_dict.keys(), 'protein') - res_list = str(query_res[0]) - res_list = ast.literal_eval(res_list[res_list.find('['):-1]) - - for uniprot_id in list(ret_dict): - if uniprot_id not in res_list: - ret_dict.pop(uniprot_id) - - return ret_dict - -if __name__ == '__main__': - qlg = QueryLilGIM() - print(qlg.query_neighbor_genes_for_gene_set_in_a_given_anatomy("UBERON:0002384", ("UniProtKB:P12004",))) - print(qlg.query_neighbor_genes_for_gene_set_in_a_given_anatomy("UBERON:0000178", 
("UniProtKB:P01579",))) - # print(qlg.query_neighbor_genes_for_gene_set_in_a_given_anatomy("UBERON:0000178", {"UniProtKB:P01579"})) diff --git a/code/reasoningtool/QuestionAnswering/README.md b/code/reasoningtool/QuestionAnswering/README.md new file mode 100644 index 000000000..0fb6113b1 --- /dev/null +++ b/code/reasoningtool/QuestionAnswering/README.md @@ -0,0 +1,14 @@ +# What is the code in this directory? + +This directory contains mostly deprecated old code from the Feasibility Assessment Phase of +the Biomedical Data Translator project. Breakage of modules in this directory due to +elimination of deprecated downstream dependencies will be noted in this README.md. + +As of Oct. 9, 2025, the function `get_ngd_for_all` was +removed from the module `RTX/code/reasoningtool/kg-construction/NormGoogleDistance.py`; +this change breaks a number of modules in this directory. If you need that function, +you can get it from the `NormGoogleDistance.py` module in any older release of +the `RTXteam/RTX` project code. 
+ + + diff --git a/code/reasoningtool/kg-construction/KGNodeIndex.py b/code/reasoningtool/kg-construction/KGNodeIndex.py deleted file mode 100644 index cebb5ace7..000000000 --- a/code/reasoningtool/kg-construction/KGNodeIndex.py +++ /dev/null @@ -1,669 +0,0 @@ -#!/usr/bin/env python3 -# -# Class to build and query an index of nodes in the KG -# -import os -import sys -import re -import timeit -import argparse -import sqlite3 - -sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/../../") -sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../QuestionAnswering") -import ReasoningUtilities as RU -#from RTXConfiguration import RTXConfiguration - -# Testing and debugging flags -DEBUG = False -TESTSUFFIX = "" -#TESTSUFFIX = "_test2" - - -# Main class -class KGNodeIndex: - - # Constructor - def __init__(self): - filepath = os.path.dirname(os.path.abspath(__file__)) - self.databaseLocation = filepath - self.lookup_table = {} - - #self.databaseLocation = 'C:/Users/ericd/Documents/zztmp' - #print(f"INFO: Temporarily using filepath {self.databaseLocation}") - - is_rtx_production = False - #if re.match("/mnt/data/orangeboard", filepath): - # is_rtx_production = True - #if DEBUG: - # print("INFO: is_rtx_production="+str(is_rtx_production)) - - if is_rtx_production: - self.databaseName = "RTXFeedback" - self.engine_type = "mysql" - else: - self.databaseName = "KGNodeIndex.sqlite" - self.engine_type = "sqlite" - self.connection = None - self.connect() - - - # Destructor - def __del__(self): - if self.engine_type == "mysql": - self.disconnect() - else: - pass - - - # Create and store a database connection - def connect(self): - # If already connected, don't need to do it again - if self.connection is not None: - return - # Create an engine object - if DEBUG is True: - print("INFO: Connecting to database") - if self.engine_type == "sqlite": - self.connection = sqlite3.connect(f"{self.databaseLocation}/{self.databaseName}") - else: - pass - #rtxConfig = 
RTXConfiguration() - #engine = create_engine("mysql+pymysql://" + rtxConfig.mysql_feedback_username + ":" + - # rtxConfig.mysql_feedback_password + "@" + rtxConfig.mysql_feedback_host + "/" + self.databaseName) - - - # Destroy the database connection - def disconnect(self): - - if self.connection is None: - if DEBUG is True: - print("INFO: Skip disconnecting from database") - return - - if DEBUG is True: - print("INFO: Disconnecting from database") - self.connection.close() - self.connection = None - - - # Delete and create the kgnode table - def create_tables(self): - if DEBUG is True: - print("INFO: Creating database "+self.databaseName) - self.connection.execute(f"DROP TABLE IF EXISTS kgnode{TESTSUFFIX}") - self.connection.execute(f"DROP TABLE IF EXISTS kg1node{TESTSUFFIX}") - self.connection.execute(f"CREATE TABLE kg1node{TESTSUFFIX}( curie VARCHAR(255), name VARCHAR(255), type VARCHAR(255), reference_curie VARCHAR(255) )" ) - self.connection.execute(f"DROP TABLE IF EXISTS kg2node{TESTSUFFIX}") - self.connection.execute(f"CREATE TABLE kg2node{TESTSUFFIX}( curie VARCHAR(255), name VARCHAR(255), type VARCHAR(255), reference_curie VARCHAR(255) )" ) - - - # Create the KG node table - def populate_table(self, kg_name): - - if kg_name == 'KG1': - table_name = 'kg1node' - file_suffix = '_KG1' - elif kg_name == 'KG2': - table_name = 'kg2node' - file_suffix = '_KG2' - else: - print("ERROR: kg_name must be either 'KG1' or 'KG2'") - sys.exit(5) - - filename = os.path.dirname(os.path.abspath(__file__)) + f"/../../../data/KGmetadata/NodeNamesDescriptions{file_suffix}.tsv" - filesize = os.path.getsize(filename) - previous_percentage = -1 - bytes_read = 0 - - lineCounter = 0 - fh = open(filename, 'r', encoding="latin-1", errors="replace") - print(f"INFO: Populating table {table_name}") - - # Have a dict for items already inserted so that we don't insert them twice - namesDict = {} - rows = [] - - for line in fh: - bytes_read += len(line) - columns = line.strip().split("\t") - 
curie = columns[0] - name = columns[1] - type = columns[2] - - #### For debugging problems - debug_flag = False - #if 'P06865' in curie: debug_flag = True - - # Some cleanup - - # Many MONDO names have a ' (disease)' suffix, which seems undesirable, so strip them out - if 'MONDO:' in curie: - name = re.sub(r'\s*\(disease\)\s*$','',name) - # Many PR names have a ' (human)' suffix, which seems undesirable, so strip them out - if 'PR:' in curie: - name = re.sub(r'\s*\(human\)\s*$','',name) - - # Create a list of all the possible names we will add to the database - names = [name] - - if re.match("OMIM:", curie): - multipleNames = name.split("; ") - if len(multipleNames) > 1: - for possibleName in multipleNames: - if possibleName == multipleNames[0]: - next - names.append(possibleName) - - elif re.match("R-HSA-", curie): - # Also store the path name without embedded abbreviations - if re.search(r' \([A-Z0-9]{1,8}\)', name): - newName = re.sub( - r' \([A-Z0-9]{1,8}\)', "", name, flags=re.IGNORECASE) - names.append(newName) - - # If this is a UniProt identifier, also add the CURIE and the naked identifier without the prefix - elif re.match("UniProtKB:[A-Z][A-Z0-9]{5}", curie) or re.match("UniProtKB:A[A-Z0-9]{9}", curie): - tmp = re.sub("UniProtKB:", "", curie) - names.append(tmp) - - # If this is a PR identifier, also add the CURIE and the naked identifier without the prefix - elif re.match("PR:[A-Z][A-Z0-9]{5}", curie) or re.match("PR:A[A-Z0-9]{9}", curie): - tmp = re.sub("PR:", "", curie) - names.append(tmp) - - # Create duplicates for various DoctorName's diseases - for name in names: - if re.search("'s ", name): - newName = re.sub("'s ", "s ", name) - names.append(newName) - #print(" duplicated _"+name+"_ to _"+newName+"_") - newName = re.sub("'s ", " ", name) - names.append(newName) - #print(" duplicated _"+name+"_ to _"+newName+"_") - - # A few special cases - if re.search("alzheimer ", name, flags=re.IGNORECASE): - newName = re.sub("alzheimer ", "alzheimers ", - 
name, flags=re.IGNORECASE) - names.append(newName) - #print(" duplicated _"+name+"_ to _"+newName+"_") - - newName = re.sub("alzheimer ", "alzheimer's ", - name, flags=re.IGNORECASE) - names.append(newName) - #print(" duplicated _"+name+"_ to _"+newName+"_") - - # Add all the possible names to the database - if debug_flag: - print() - print(names) - - for name in names: - name = name.upper() - if name in namesDict and curie in namesDict[name]: - continue - - # Hard-coded list of short abbreviations to ignore because they're also English - if name == "IS": - continue - if name == "AS": - continue - - # Check and add an entry to the lookup table - reference_curie = None - if name in self.lookup_table: - reference_curie = self.lookup_table[name] - if curie not in self.lookup_table: - self.lookup_table[curie] = reference_curie - else: - reference_curie = curie - if curie in self.lookup_table: - self.lookup_table[name] = reference_curie - else: - self.lookup_table[curie] = reference_curie - self.lookup_table[name] = reference_curie - if debug_flag: print(f"reference_curie for {name} is {reference_curie}") - - # Add a row for this node - rows.append([curie,name,type,reference_curie]) - if debug_flag: print([curie,name,type,reference_curie]) - if name not in namesDict: - namesDict[name] = {} - namesDict[name][curie] = 1 - - # Try also adding in the curie as a resolvable name - if curie not in namesDict: - if debug_flag: print(f"reference_curie for {curie} is {reference_curie}") - rows.append([curie,curie.upper(),type,reference_curie]) - if debug_flag: print([curie,curie.upper(),type,reference_curie]) - if curie not in namesDict: - namesDict[curie] = {} - namesDict[curie][curie] = 1 - - # Commit every 10000 lines - percentage = int(bytes_read*100.0/filesize) - if percentage > previous_percentage: - self.connection.executemany(f"INSERT INTO {table_name}{TESTSUFFIX}(curie,name,type,reference_curie) values (?,?,?,?)", rows) - self.connection.commit() - rows = [] - 
def create_indexes(self, kg_name):
    """Create the name, curie and reference_curie indexes on one node table.

    :param kg_name: 'KG1' or 'KG2'; any other value prints an error and
        terminates the process with exit status 5.
    """
    if kg_name == 'KG1':
        table_name = 'kg1node'
    elif kg_name == 'KG2':
        table_name = 'kg2node'
    else:
        print("ERROR: kg_name must be either 'KG1' or 'KG2'")
        sys.exit(5)

    full_table_name = f"{table_name}{TESTSUFFIX}"
    print(f"INFO: Creating INDEXes on {full_table_name}")
    for column in ('name', 'curie', 'reference_curie'):
        self.connection.execute(
            f"CREATE INDEX idx_{full_table_name}_{column} ON {full_table_name}({column})")


def get_curies_and_types(self, name, kg_name='KG1'):
    """Look up every node whose stored name equals *name* (case-insensitive).

    :param name: name to search for; it is upper-cased before the lookup
        because names are stored upper-cased.
    :param kg_name: 'KG2' (any case) selects the KG2 table, anything else KG1.
    :return: list of ``{"curie": ..., "type": ...}`` dicts, one per matching row.
    """
    table_name = 'kg2node' if kg_name.upper() == 'KG2' else 'kg1node'

    cursor = self.connection.cursor()
    cursor.execute(f"SELECT * FROM {table_name}{TESTSUFFIX} WHERE name = ?", (name.upper(),))
    return [{"curie": row[0], "type": row[2]} for row in cursor.fetchall()]


def get_curies_and_types_and_names(self, name, kg_name='KG1'):
    """Like get_curies_and_types, but also attach a display name and description.

    :param name: name to search for (upper-cased before the lookup).
    :param kg_name: 'KG2' (any case) selects the KG2 table, anything else KG1.
    :return: list of dicts with keys "curie", "type", "name" and, when the
        knowledge-graph lookup succeeds and provides one, "description".
        "name" is the first name returned by get_names, or "?" if there is none.
    """
    table_name = 'kg2node' if kg_name.upper() == 'KG2' else 'kg1node'

    cursor = self.connection.cursor()
    cursor.execute(f"SELECT * FROM {table_name}{TESTSUFFIX} WHERE name = ?", (name.upper(),))

    curies_and_types_and_names = []
    for row in cursor.fetchall():
        names = self.get_names(row[0], kg_name=kg_name)
        # get_names returns a (possibly empty) list, so test for emptiness
        # rather than None before indexing into it.
        best_name = names[0] if names else "?"
        entity = {"curie": row[0], "type": row[2], "name": best_name}

        # Also try to fetch the description from the knowledge graph.
        # This is deliberately best-effort: the lookup fails when the node
        # is in KG2 but not KG1. FIXME
        try:
            properties = RU.get_node_properties(row[0])
            if 'description' in properties:
                entity['description'] = properties['description']
        except Exception:
            pass
        curies_and_types_and_names.append(entity)

    return curies_and_types_and_names


def get_names(self, curie, kg_name='KG1'):
    """Return the names stored for *curie*.

    :param curie: CURIE to look up (matched exactly against the curie column).
    :param kg_name: 'KG2' (any case) selects the KG2 table, anything else KG1.
    :return: list of stored names, excluding the row whose "name" is merely
        the CURIE itself.
    """
    table_name = 'kg2node' if kg_name.upper() == 'KG2' else 'kg1node'

    cursor = self.connection.cursor()
    cursor.execute(f"SELECT * FROM {table_name}{TESTSUFFIX} WHERE curie = ?", (curie,))

    # populate_table stores the CURIE-as-name row upper-cased, so compare
    # against the upper-cased CURIE to recognise it for mixed-case prefixes.
    curie_as_name = curie.upper()
    names = []
    for row in cursor.fetchall():
        if row[1] == curie or row[1] == curie_as_name:
            continue
        names.append(row[1])
    return names


def get_curies(self, name, kg_name='KG1'):
    """Return the CURIEs of every node whose stored name equals *name*.

    :param name: name to search for (case-insensitive).
    :param kg_name: 'KG2' (any case) selects the KG2 table, anything else KG1.
    :return: list of CURIE strings (None only if the underlying lookup
        returns None).
    """
    curies_and_types = self.get_curies_and_types(name, kg_name)
    if curies_and_types is None:
        return None
    return [entry["curie"] for entry in curies_and_types]


def is_curie_present(self, curie, kg_name='KG1'):
    """Return True if at least one row exists for *curie* in the chosen table.

    :param curie: CURIE to look up (matched exactly).
    :param kg_name: 'KG2' (any case) selects the KG2 table, anything else KG1.
    """
    table_name = 'kg2node' if kg_name.upper() == 'KG2' else 'kg1node'

    cursor = self.connection.cursor()
    cursor.execute(f"SELECT * FROM {table_name}{TESTSUFFIX} WHERE curie = ?", (curie,))
    return cursor.fetchone() is not None
def convert_curie(self, curie, namespace):
    """Translate a CURIE (or name) into equivalent CURIEs of another namespace.

    The input is looked up in the KG2 name column (upper-cased), the
    reference CURIE of the first matching row is taken, and every CURIE that
    shares that reference CURIE and starts with ``namespace + ':'`` is
    returned.

    :param curie: CURIE or name to convert.
    :param namespace: CURIE prefix wanted, without the trailing colon
        (e.g. "DOID", "CHEMBL.COMPOUND").
    :return: list of distinct matching CURIEs in row order; empty if the
        input is unknown.
    """
    cursor = self.connection.cursor()
    cursor.execute(f"SELECT * FROM kg2node{TESTSUFFIX} WHERE name = ?", (curie.upper(),))
    rows = cursor.fetchall()

    if not rows:
        return []

    reference_curie = rows[0][3]

    cursor = self.connection.cursor()
    cursor.execute(f"SELECT * FROM kg2node{TESTSUFFIX} WHERE reference_curie = ?", (reference_curie,))

    # Match the prefix literally: namespaces such as "CHEMBL.COMPOUND"
    # contain regex metacharacters and must not be interpreted as a pattern.
    prefix = namespace + ':'
    seen = set()
    curies_list = []
    for row in cursor.fetchall():
        candidate = row[0]
        if candidate.startswith(prefix) and candidate not in seen:
            seen.add(candidate)
            curies_list.append(candidate)
    return curies_list


def get_equivalent_curies(self, curie, kg_name='KG2'):
    """Return every CURIE that shares a reference CURIE with *curie*.

    :param curie: CURIE to look up (matched exactly against the curie column).
    :param kg_name: 'KG2' (any case) selects the KG2 table, anything else KG1.
    :return: list of distinct CURIEs in row order; empty if *curie* is unknown.
    """
    table_name = 'kg2node' if kg_name.upper() == 'KG2' else 'kg1node'

    cursor = self.connection.cursor()
    cursor.execute(f"SELECT * FROM {table_name}{TESTSUFFIX} WHERE curie = ?", (curie,))
    rows = cursor.fetchall()

    if not rows:
        return []

    # When several rows match, the reference CURIE of the last one is used.
    reference_curie = rows[-1][3]

    cursor = self.connection.cursor()
    cursor.execute(f"SELECT * FROM {table_name}{TESTSUFFIX} WHERE reference_curie = ?", (reference_curie,))

    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(row[0] for row in cursor.fetchall()))
equivalence[curie]['equivalent_identifiers'] = [] - equivalence[curie]['type'] = [ rows[0][2]] - - # What if there are multiple rows returned, this is not handled. FIXME - #reference_curies = {} - #for row in rows: - # reference_curies[row[3]] = 1 - - cursor = self.connection.cursor() - cursor.execute( f"SELECT * FROM {table_name}{TESTSUFFIX} WHERE reference_curie = ?", (reference_curie,) ) - rows = cursor.fetchall() - - curies = {} - for row in rows: - row_curie = row[0] - if row_curie not in curies: - equivalence[curie]['equivalent_identifiers'].append( { 'identifier': row_curie, 'label': row[1] } ) - if row_curie == curie: - equivalence[curie]['id']['label'] = row[1] - curies[row_curie] = 1 - - return equivalence - - - def get_total_entity_count(self, type, kg_name='KG1'): - - table_name = 'kg1node' - if kg_name.upper() == 'KG2': - table_name = 'kg2node' - - count = None - - cursor = self.connection.cursor() - cursor.execute( f"SELECT COUNT(DISTINCT reference_curie) FROM {table_name}{TESTSUFFIX} WHERE type = ?", (type,) ) - rows = cursor.fetchall() - - if len(rows) == 0: - return count - - return rows[0][0] - - - - def test_select(self, name): - - cursor = self.connection.cursor() - cursor.execute( f"SELECT * FROM kg1node{TESTSUFFIX} WHERE curie = ?", (name.upper(),) ) - rows = cursor.fetchall() - for row in rows: - print('KG1:',row) - - cursor = self.connection.cursor() - cursor.execute( f"SELECT * FROM kg2node{TESTSUFFIX} WHERE curie = ?", (name.upper(),) ) - rows = cursor.fetchall() - for row in rows: - print('KG2:',row) - - -#################################################################################################### -def main(): - - import json - - parser = argparse.ArgumentParser( - description="Tests or rebuilds the KG Node Index", formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('-b', '--build', action="store_true", - help="If set, (re)build the index from scratch", default=False) - parser.add_argument('-t', '--test', 
action="store_true", - help="If set, run a test of the index by doing several lookups", default=False) - args = parser.parse_args() - - if not args.build and not args.test: - parser.print_help() - sys.exit(2) - - kgNodeIndex = KGNodeIndex() - - # To (re)build - if args.build: - kgNodeIndex.create_tables() - kgNodeIndex.populate_table(kg_name='KG1') - kgNodeIndex.create_indexes(kg_name='KG1') - kgNodeIndex.populate_table(kg_name='KG2') - kgNodeIndex.create_indexes(kg_name='KG2') - - # Exit here if tests are not requested - if not args.test: - return - - print("==== Testing for finding curies by name ====") - tests = ["APS2", "phenylketonuria", "Gaucher's disease", "Gauchers disease", "Gaucher disease", - "Alzheimer Disease", "Alzheimers disease", "Alzheimer's Disease", "kidney", "KIDney", "P06865", "HEXA", - "UniProtKB:P12004", "rickets", "fanconi anemia", "retina", "is"] - - # The first one takes a bit longer, so do one before starting the timer - test = kgNodeIndex.get_curies("ibuprofen") - - t0 = timeit.default_timer() - for test in tests: - curies = kgNodeIndex.get_curies(test) - print(test+" = "+str(curies)) - t1 = timeit.default_timer() - print("Elapsed time: "+str(t1-t0)) - - - print("==== Testing presence of CURIEs ============================") - tests = ["REACT:R-HSA-2160456", "DOID:9281", "OMIM:261600", "DOID:1926xx", "HP:0002511", - "UBERON:0002113", "UniProtKB:P06865", "P06865", "KEGG:C10399", "GO:0034187", "DOID:10652xx"] - - t0 = timeit.default_timer() - for test in tests: - is_present = kgNodeIndex.is_curie_present(test) - print(test+" = "+str(is_present)) - t1 = timeit.default_timer() - print("Elapsed time: "+str(t1-t0)) - - - print("==== Getting properties by CURIE ============================") - tests = ["REACT:R-HSA-2160456", "DOID:9281", - "OMIM:261600", "DOID:1926xx", "P06865"] - - t0 = timeit.default_timer() - for test in tests: - node_properties = kgNodeIndex.get_curies_and_types_and_names(test) - print(test+" = "+str(node_properties)) - t1 
= timeit.default_timer() - print("Elapsed time: "+str(t1-t0)) - - - print("==== Testing for KG1 and KG2 ============================") - tests = ["APS2", "phenylketonuria", "Gauchers disease", "kidney", "HEXA", - "UniProtKB:P12004", "fanconi anemia", "ibuprofen"] - - t0 = timeit.default_timer() - for test in tests: - curies = kgNodeIndex.get_curies(test) - print(test+" in KG1 = "+str(curies)) - curies = kgNodeIndex.get_curies(test, kg_name='KG2') - print(test+" in KG2 = "+str(curies)) - t1 = timeit.default_timer() - print("Elapsed time: "+str(t1-t0)) - - - print("==== Getting KG1 CURIEs ============================") - tests = ["CUI:C0031485", "CUI:C0017205", "UniProtKB:P06865", "MESH:D005199", "HEXA", - "CHEBI:5855", "fanconi anemia", "ibuprofen", 'DOID:9281'] - - t0 = timeit.default_timer() - for test in tests: - curies = kgNodeIndex.get_KG1_curies(test) - print(test+" = "+str(curies)) - t1 = timeit.default_timer() - print("Elapsed time: "+str(t1-t0)) - - print("==== Convert CURIEs to requested namespace ============================") - tests = [ [ "CUI:C0031485", "DOID" ], [ "FMA:7203", "UBERON" ], [ "MESH:D005199", "DOID" ], - [ "CHEBI:5855", "CHEMBL.COMPOUND" ], [ "ibuprofen", "CUI" ] ] - - t0 = timeit.default_timer() - for test in tests: - curies = kgNodeIndex.convert_curie(test[0], test[1]) - print(f"{test[0]} -> {test[1]} = " + str(curies)) - t1 = timeit.default_timer() - print("Elapsed time: "+str(t1-t0)) - - print("==== Get all known synonyms of a CURIE using KG2 index ============================") - tests = [ "DOID:14330", "CUI:C0031485", "FMA:7203", "MESH:D005199", "CHEBI:5855", "DOID:9281" ] - tests = [ "DOID:9281" ] - - t0 = timeit.default_timer() - for test in tests: - curies = kgNodeIndex.get_equivalent_curies(test,kg_name='KG1') - print(f"{test} = " + str(curies)) - curies = kgNodeIndex.get_equivalent_curies(test,kg_name='KG2') - print(f"{test} = " + str(curies)) - equivalence_mapping = kgNodeIndex.get_equivalent_entities(test,kg_name='KG1') - 
print(json.dumps(equivalence_mapping,sort_keys=True,indent=2)) - equivalence_mapping = kgNodeIndex.get_equivalent_entities(test,kg_name='KG2') - print(json.dumps(equivalence_mapping,sort_keys=True,indent=2)) - t1 = timeit.default_timer() - print("Elapsed time: "+str(t1-t0)) - - print("==== Get total number of drug nodes and disease nodes ============================") - t0 = timeit.default_timer() - kg = 'KG1' - print(kgNodeIndex.get_total_entity_count('chemical_substance', kg_name=kg)) - print(kgNodeIndex.get_total_entity_count('disease', kg_name=kg)) - print(kgNodeIndex.get_total_entity_count('protein', kg_name=kg)) - print(kgNodeIndex.get_total_entity_count('drug', kg_name=kg)) - print(kgNodeIndex.get_total_entity_count('cheesecake', kg_name=kg)) - t1 = timeit.default_timer() - print("Elapsed time: "+str(t1-t0)) - - #print("==== Test SELECT ============================") - #kgNodeIndex.test_select('phenylketonuria') - #kgNodeIndex.test_select('CUI:C4710278') - #kgNodeIndex.test_select('UniProtKB:P06865') - #print(kgNodeIndex.is_curie_present('CUI:C4710278')) - -#################################################################################################### -if __name__ == "__main__": - main() diff --git a/code/reasoningtool/kg-construction/Neo4jConnection.py b/code/reasoningtool/kg-construction/Neo4jConnection.py deleted file mode 100644 index 2c042287b..000000000 --- a/code/reasoningtool/kg-construction/Neo4jConnection.py +++ /dev/null @@ -1,661 +0,0 @@ -''' This module defines the class Neo4jConnection. Neo4jConnection class is designed -to connect to Neo4j database and perform operations on a graphic model object. (e.g., -retrieve node and update node) The available methods include: - - get_xxx_nodes : query all xxx nodes - update_xxx_nodes : update xxx nodes by an array 'nodes', which contain two properties 'node_id' - and 'extended_info_json' for each node - get_xxx_node : query xxx node by ID - - xxx is the type of nodes. 
(e.g., anatomy, phenotype, microRNA, pathway, protein, disease) - -''' - -__author__ = 'Deqing Qu' -__copyright__ = 'Oregon State University' -__credits__ = ['Deqing Qu', 'Stephen Ramsey'] -__license__ = 'MIT' -__version__ = '0.1.0' -__maintainer__ = '' -__email__ = '' -__status__ = 'Prototype' - -from neo4j.v1 import GraphDatabase - - -class Neo4jConnection: - - def __init__(self, uri, user, password): - self._driver = GraphDatabase.driver(uri, auth=(user, password)) - - def close(self): - self._driver.close() - - def get_anatomy_nodes(self): - with self._driver.session() as session: - return session.write_transaction(self._get_anatomy_nodes) - - def get_phenotype_nodes(self): - with self._driver.session() as session: - return session.write_transaction(self._get_phenotype_nodes) - - def get_microRNA_nodes(self): - with self._driver.session() as session: - return session.write_transaction(self._get_microRNA_nodes) - - def get_pathway_nodes(self): - with self._driver.session() as session: - return session.write_transaction(self._get_pathway_nodes) - - def get_protein_nodes(self): - with self._driver.session() as session: - return session.write_transaction(self._get_protein_nodes) - - def get_disease_nodes(self): - with self._driver.session() as session: - return session.write_transaction(self._get_disease_nodes) - - def get_chemical_substance_nodes(self): - with self._driver.session() as session: - return session.write_transaction(self._get_chemical_substance_nodes) - - def get_bio_process_nodes(self): - with self._driver.session() as session: - return session.write_transaction(self._get_bio_process_nodes) - - def get_cellular_component_nodes(self): - with self._driver.session() as session: - return session.write_transaction(self._get_cellular_component_nodes) - - def get_molecular_function_nodes(self): - with self._driver.session() as session: - return session.write_transaction(self._get_molecular_function_nodes) - - def get_metabolite_nodes(self): - with 
self._driver.session() as session: - return session.read_transaction(self._get_metabolite_nodes) - - def update_anatomy_nodes(self, nodes): - with self._driver.session() as session: - return session.write_transaction(self._update_anatomy_nodes, nodes) - - def update_phenotype_nodes(self, nodes): - with self._driver.session() as session: - return session.write_transaction(self._update_phenotype_nodes, nodes) - - def update_microRNA_nodes(self, nodes): - with self._driver.session() as session: - return session.write_transaction(self._update_microRNA_nodes, nodes) - - def update_pathway_nodes(self, nodes): - with self._driver.session() as session: - return session.write_transaction(self._update_pathway_nodes, nodes) - - def update_protein_nodes(self, nodes): - with self._driver.session() as session: - return session.write_transaction(self._update_protein_nodes, nodes) - - def update_disease_nodes(self, nodes): - with self._driver.session() as session: - return session.write_transaction(self._update_disease_nodes, nodes) - - def update_chemical_substance_nodes(self, nodes): - with self._driver.session() as session: - return session.write_transaction(self._update_chemical_substance_nodes, nodes) - - def update_bio_process_nodes(self, nodes): - with self._driver.session() as session: - return session.write_transaction(self._update_bio_process_nodes, nodes) - - def get_anatomy_node(self, id): - with self._driver.session() as session: - return session.write_transaction(self._get_anatomy_node, id) - - def get_phenotype_node(self, id): - with self._driver.session() as session: - return session.write_transaction(self._get_phenotype_node, id) - - def get_microRNA_node(self, id): - with self._driver.session() as session: - return session.write_transaction(self._get_microRNA_node, id) - - def get_pathway_node(self, id): - with self._driver.session() as session: - return session.write_transaction(self._get_pathway_node, id) - - def get_protein_node(self, id): - with 
self._driver.session() as session: - return session.write_transaction(self._get_protein_node, id) - - def get_disease_node(self, id): - with self._driver.session() as session: - return session.write_transaction(self._get_disease_node, id) - - def get_chemical_substance_node(self, id): - with self._driver.session() as session: - return session.write_transaction(self._get_chemical_substance_node, id) - - def get_bio_process_node(self, id): - with self._driver.session() as session: - return session.write_transaction(self._get_bio_process_node, id) - - def get_node(self, id): - with self._driver.session() as session: - return session.write_transaction(self._get_node, id) - - def update_anatomy_nodes_desc(self, nodes): - with self._driver.session() as session: - return session.write_transaction(self._update_anatomy_nodes_desc, nodes) - - def update_phenotype_nodes_desc(self, nodes): - with self._driver.session() as session: - return session.write_transaction(self._update_phenotype_nodes_desc, nodes) - - def update_microRNA_nodes_desc(self, nodes): - with self._driver.session() as session: - return session.write_transaction(self._update_microRNA_nodes_desc, nodes) - - def update_pathway_nodes_desc(self, nodes): - with self._driver.session() as session: - return session.write_transaction(self._update_pathway_nodes_desc, nodes) - - def update_protein_nodes_desc(self, nodes): - with self._driver.session() as session: - return session.write_transaction(self._update_protein_nodes_desc, nodes) - - def update_disease_nodes_desc(self, nodes): - with self._driver.session() as session: - return session.write_transaction(self._update_disease_nodes_desc, nodes) - - def update_chemical_substance_nodes_desc(self, nodes): - with self._driver.session() as session: - return session.write_transaction(self._update_chemical_substance_nodes_desc, nodes) - - def update_bio_process_nodes_desc(self, nodes): - with self._driver.session() as session: - return 
session.write_transaction(self._update_bio_process_nodes_desc, nodes) - - def update_cellular_component_nodes_desc(self, nodes): - with self._driver.session() as session: - return session.write_transaction(self._update_cellular_component_desc, nodes) - - def update_molecular_function_nodes_desc(self, nodes): - with self._driver.session() as session: - return session.write_transaction(self._update_molecular_function_desc, nodes) - - def update_protein_nodes_name(self, nodes): - with self._driver.session() as session: - return session.write_transaction(self._update_protein_nodes_name, nodes) - - def update_metabolite_nodes_desc(self, nodes): - with self._driver.session() as session: - return session.write_transaction(self._update_metabolite_desc, nodes) - - def get_node_names(self, type): - with self._driver.session() as session: - return session.write_transaction(self._get_node_names, type) - - def create_disease_has_phenotype(self, array): - with self._driver.session() as session: - return session.write_transaction(self.__create_disease_has_phenotype, array) - - def remove_duplicate_has_phenotype_relations(self): - with self._driver.session() as session: - return session.write_transaction(self.__remove_duplicate_has_phenotype_relations) - - def count_has_phenotype_relation(self, relation): - """ - - :param relation: {"d_id": "DOID:xxxx", "p_id": "HP:xxxx"} - :return: count of relations between d_id and p_id - """ - with self._driver.session() as session: - return session.write_transaction(self.__count_has_phenotype_relation, relation) - - def remove_duplicated_react_nodes(self): - with self._driver.session() as session: - return session.write_transaction(self.__remove_duplicated_react_nodes) - - def count_duplicated_nodes(self): - with self._driver.session() as session: - return session.write_transaction(self.__count_duplicated_nodes) - - def get_relationship(self, r_type, s_id, t_id): - with self._driver.session() as session: - return 
session.write_transaction(self._get_relationship, r_type, s_id, t_id) - - @staticmethod - def _get_anatomy_nodes(tx): - result = tx.run("MATCH (n:anatomical_entity) RETURN n.rtx_name") - return [record["n.rtx_name"] for record in result] - - @staticmethod - def _get_phenotype_nodes(tx): - result = tx.run("MATCH (n:phenotypic_feature) RETURN n.rtx_name") - return [record["n.rtx_name"] for record in result] - - @staticmethod - def _get_microRNA_nodes(tx): - result = tx.run("MATCH (n:microRNA) RETURN n.rtx_name") - return [record["n.rtx_name"] for record in result] - - @staticmethod - def _get_pathway_nodes(tx): - result = tx.run("MATCH (n:pathway) RETURN n.rtx_name") - return [record["n.rtx_name"] for record in result] - - @staticmethod - def _get_protein_nodes(tx): - result = tx.run("MATCH (n:protein) RETURN n.id") - return [record["n.id"] for record in result] - - @staticmethod - def _get_disease_nodes(tx): - result = tx.run("MATCH (n:disease) RETURN n.rtx_name") - return [record["n.rtx_name"] for record in result] - - @staticmethod - def _get_chemical_substance_nodes(tx): - result = tx.run("MATCH (n:chemical_substance) RETURN n.rtx_name") - return [record["n.rtx_name"] for record in result] - - @staticmethod - def _get_bio_process_nodes(tx): - result = tx.run("MATCH (n:biological_process) RETURN n.rtx_name") - return [record["n.rtx_name"] for record in result] - - @staticmethod - def _get_cellular_component_nodes(tx): - result = tx.run("MATCH (n:cellular_component) RETURN n.rtx_name") - return [record["n.rtx_name"] for record in result] - - @staticmethod - def _get_molecular_function_nodes(tx): - result = tx.run("MATCH (n:molecular_function) RETURN n.rtx_name") - return [record["n.rtx_name"] for record in result] - - @staticmethod - def _get_metabolite_nodes(tx): - result = tx.run("MATCH (n:metabolite) RETURN n.rtx_name") - return [record["n.rtx_name"] for record in result] - - @staticmethod - def _update_anatomy_nodes(tx, nodes): - result = tx.run( - """ - UNWIND 
{nodes} AS row - WITH row.node_id AS node_id, row.extended_info_json AS extended_info_json - MATCH (n:anatomical_entity{rtx_name:node_id}) - SET n.extended_info_json=extended_info_json - """, - nodes=nodes, - ) - return result - - @staticmethod - def _update_phenotype_nodes(tx, nodes): - result = tx.run( - """ - UNWIND {nodes} AS row - WITH row.node_id AS node_id, row.extended_info_json AS extended_info_json - MATCH (n:phenotypic_feature{rtx_name:node_id}) - SET n.extended_info_json=extended_info_json - """, - nodes=nodes, - ) - return result - - @staticmethod - def _update_microRNA_nodes(tx, nodes): - result = tx.run( - """ - UNWIND {nodes} AS row - WITH row.node_id AS node_id, row.extended_info_json AS extended_info_json - MATCH (n:microRNA{rtx_name:node_id}) - SET n.extended_info_json=extended_info_json - """, - nodes=nodes, - ) - return result - - @staticmethod - def _update_pathway_nodes(tx, nodes): - result = tx.run( - """ - UNWIND {nodes} AS row - WITH row.node_id AS node_id, row.extended_info_json AS extended_info_json - MATCH (n:pathway{rtx_name:node_id}) - SET n.extended_info_json=extended_info_json - """, - nodes=nodes, - ) - return result - - @staticmethod - def _update_protein_nodes(tx, nodes): - result = tx.run( - """ - UNWIND {nodes} AS row - WITH row.node_id AS node_id, row.extended_info_json AS extended_info_json - MATCH (n:protein{id:node_id}) - SET n.extended_info_json=extended_info_json - """, - nodes=nodes, - ) - return result - - @staticmethod - def _update_disease_nodes(tx, nodes): - result = tx.run( - """ - UNWIND {nodes} AS row - WITH row.node_id AS node_id, row.extended_info_json AS extended_info_json - MATCH (n:disease{rtx_name:node_id}) - SET n.extended_info_json=extended_info_json - """, - nodes=nodes, - ) - return result - - @staticmethod - def _update_chemical_substance_nodes(tx, nodes): - result = tx.run( - """ - UNWIND {nodes} AS row - WITH row.node_id AS node_id, row.extended_info_json AS extended_info_json - MATCH 
@staticmethod
def _update_bio_process_nodes(tx, nodes):
    """Set extended_info_json on biological_process nodes matched by rtx_name.

    :param tx: transaction object exposing ``run``.
    :param nodes: list of rows, each providing ``node_id`` and
        ``extended_info_json``.
    :return: the result object returned by ``tx.run``.
    """
    result = tx.run(
        """
        UNWIND {nodes} AS row
        WITH row.node_id AS node_id, row.extended_info_json AS extended_info_json
        MATCH (n:biological_process{rtx_name:node_id})
        SET n.extended_info_json=extended_info_json
        """,
        nodes=nodes,
    )
    return result

# The single-node getters below pass the identifier as a query parameter
# instead of interpolating it into the Cypher text, so identifiers containing
# quotes can neither break the query nor inject Cypher. The identifier is
# converted with str() to keep the matching behaviour of the former '%s'
# formatting.

@staticmethod
def _get_anatomy_node(tx, id):
    """Return the single anatomical_entity record whose rtx_name equals *id*."""
    result = tx.run("MATCH (n:anatomical_entity{rtx_name:$id}) RETURN n", id=str(id))
    return result.single()

@staticmethod
def _get_phenotype_node(tx, id):
    """Return the single phenotypic_feature record whose rtx_name equals *id*."""
    result = tx.run("MATCH (n:phenotypic_feature{rtx_name:$id}) RETURN n", id=str(id))
    return result.single()

@staticmethod
def _get_microRNA_node(tx, id):
    """Return the single microRNA record whose rtx_name equals *id*."""
    result = tx.run("MATCH (n:microRNA{rtx_name:$id}) RETURN n", id=str(id))
    return result.single()

@staticmethod
def _get_pathway_node(tx, id):
    """Return the single pathway record whose rtx_name equals *id*."""
    result = tx.run("MATCH (n:pathway{rtx_name:$id}) RETURN n", id=str(id))
    return result.single()

@staticmethod
def _get_protein_node(tx, id):
    """Return the single protein record whose id property equals *id*."""
    result = tx.run("MATCH (n:protein{id:$id}) RETURN n", id=str(id))
    return result.single()

@staticmethod
def _get_disease_node(tx, id):
    """Return the single disease record whose rtx_name equals *id*."""
    result = tx.run("MATCH (n:disease{rtx_name:$id}) RETURN n", id=str(id))
    return result.single()

@staticmethod
def _get_chemical_substance_node(tx, id):
    """Return the single chemical_substance record whose rtx_name equals *id*."""
    result = tx.run("MATCH (n:chemical_substance{rtx_name:$id}) RETURN n", id=str(id))
    return result.single()

@staticmethod
def _get_bio_process_node(tx, id):
    """Return the single biological_process record whose rtx_name equals *id*."""
    result = tx.run("MATCH (n:biological_process{rtx_name:$id}) RETURN n", id=str(id))
    return result.single()

@staticmethod
def _get_node(tx, id):
    """Return the single record, of any label, whose rtx_name equals *id*."""
    result = tx.run("MATCH (n{rtx_name:$id}) RETURN n", id=str(id))
    return result.single()
(n:anatomical_entity{rtx_name:node_id}) - SET n.description=description - """, - nodes=nodes, - ) - return result - - @staticmethod - def _update_phenotype_nodes_desc(tx, nodes): - result = tx.run( - """ - UNWIND {nodes} AS row - WITH row.node_id AS node_id, row.desc AS description - MATCH (n:phenotypic_feature{rtx_name:node_id}) - SET n.description=description - """, - nodes=nodes, - ) - return result - - @staticmethod - def _update_microRNA_nodes_desc(tx, nodes): - result = tx.run( - """ - UNWIND {nodes} AS row - WITH row.node_id AS node_id, row.desc AS description - MATCH (n:microRNA{rtx_name:node_id}) - SET n.description=description - """, - nodes=nodes, - ) - return result - - @staticmethod - def _update_disease_nodes_desc(tx, nodes): - result = tx.run( - """ - UNWIND {nodes} AS row - WITH row.node_id AS node_id, row.desc AS description - MATCH (n:disease{rtx_name:node_id}) - SET n.description=description - """, - nodes=nodes, - ) - return result - - @staticmethod - def _update_pathway_nodes_desc(tx, nodes): - result = tx.run( - """ - UNWIND {nodes} AS row - WITH row.node_id AS node_id, row.desc AS description - MATCH (n:pathway{rtx_name:node_id}) - SET n.description=description - """, - nodes=nodes, - ) - return result - - @staticmethod - def _update_protein_nodes_desc(tx, nodes): - result = tx.run( - """ - UNWIND {nodes} AS row - WITH row.node_id AS node_id, row.desc AS description - MATCH (n:protein{id:node_id}) - SET n.description=description - """, - nodes=nodes, - ) - return result - - @staticmethod - def _update_chemical_substance_nodes_desc(tx, nodes): - result = tx.run( - """ - UNWIND {nodes} AS row - WITH row.node_id AS node_id, row.desc AS description - MATCH (n:chemical_substance{rtx_name:node_id}) - SET n.description=description - """, - nodes=nodes, - ) - return result - - @staticmethod - def _update_bio_process_nodes_desc(tx, nodes): - result = tx.run( - """ - UNWIND {nodes} AS row - WITH row.node_id AS node_id, row.desc AS description - MATCH 
(n:biological_process{rtx_name:node_id}) - SET n.description=description - """, - nodes=nodes, - ) - return result - - @staticmethod - def _update_cellular_component_desc(tx, nodes): - result = tx.run( - """ - UNWIND {nodes} AS row - WITH row.node_id AS node_id, row.desc AS description - MATCH (n:cellular_component{rtx_name:node_id}) - SET n.description=description - """, - nodes=nodes, - ) - return result - - @staticmethod - def _update_molecular_function_desc(tx, nodes): - result = tx.run( - """ - UNWIND {nodes} AS row - WITH row.node_id AS node_id, row.desc AS description - MATCH (n:molecular_function{rtx_name:node_id}) - SET n.description=description - """, - nodes=nodes, - ) - return result - - @staticmethod - def _update_protein_nodes_name(tx, nodes): - result = tx.run( - """ - UNWIND {nodes} AS row - WITH row.node_id AS node_id, row.name AS name - MATCH (n:protein{id:node_id}) - SET n.name=name - """, - nodes=nodes, - ) - return result - - @staticmethod - def _update_metabolite_desc(tx, nodes): - result = tx.run( - """ - UNWIND {nodes} AS row - WITH row.node_id AS node_id, row.desc AS description - MATCH (n:metabolite{rtx_name:node_id}) - SET n.description=description - """, - nodes=nodes, - ) - return result - - @staticmethod - def _get_node_names(tx, type): - result = tx.run("MATCH (n:%s) RETURN n.name" % type) - return [record["n.name"] for record in result] - - @staticmethod - def __create_disease_has_phenotype(tx, array): - result = tx.run( - """ - UNWIND {array} AS row - WITH row.d_id AS d_id, row.p_id AS p_id - MATCH (d:disease {rtx_name:d_id}), (p:phenotypic_feature {rtx_name:p_id}) - CREATE (d)-[:has_phenotype { - source_node_uuid: d.UUID, - target_node_uuid: p.UUID, - is_defined_by: \'RTX\', - provided_by: \'BioLink\', - predicate: \'has_phenotype\', - seed_node_uuid: d.seed_node_uuid, - relation: \'has_phenotype\' - }]->(p) - """, - array=array - ) - return result - - @staticmethod - def __remove_duplicate_has_phenotype_relations(tx): - result = 
tx.run( - """ - MATCH (a)-[r:has_phenotype]->(b) - WITH a, b, TAIL (COLLECT (r)) as rr - WHERE size(rr)>0 - FOREACH (r IN rr | DELETE r) - """ - ) - return result - - @staticmethod - def __count_has_phenotype_relation(tx, relation): - result = tx.run( - """ - MATCH p = (a {rtx_name:$relation.d_id})-[r:has_phenotype]->(b {rtx_name:$relation.p_id}) - RETURN count(p) - """, - relation=relation - ) - return result.single()['count(p)'] - - @staticmethod - def __remove_duplicated_react_nodes(tx): - result = tx.run( - """ - MATCH (n), (m) - WHERE n<>m AND n.id=m.id AND split(n.rtx_name, ':')[0] = 'REACT' - DELETE n - """ - ) - return result - - @staticmethod - def __count_duplicated_nodes(tx): - result = tx.run( - """ - MATCH (n), (m) - WHERE n<>m AND n.id=m.id return count(*) - """, - ) - return result.single()['count(*)'] - - @staticmethod - def _get_relationship(tx, r_type, s_id, t_id): - result = tx.run("MATCH p=()-[r:%s]->() where r.source_node_uuid= '%s' and r.target_node_uuid='%s' RETURN r" % - (r_type, s_id, t_id)) - return result.single() diff --git a/code/reasoningtool/kg-construction/NormGoogleDistance.py b/code/reasoningtool/kg-construction/NormGoogleDistance.py index 9746ac6ed..b07cf9a04 100644 --- a/code/reasoningtool/kg-construction/NormGoogleDistance.py +++ b/code/reasoningtool/kg-construction/NormGoogleDistance.py @@ -14,7 +14,6 @@ from cache_control_helper import CacheControlHelper from QueryNCBIeUtils import QueryNCBIeUtils -from QueryDisont import QueryDisont # DOID -> MeSH from QueryEBIOLS import QueryEBIOLS # UBERON -> MeSH from QueryMyChem import QueryMyChem import sqlite3 @@ -94,7 +93,7 @@ def get_mesh_term_for_all(curie_id, description): "UBERON" + "CL" - not supposed to be here? 
"NCBIGene" + - "DOID" + + "DOID" - "OMIM" + "ChEMBL" + """ @@ -148,18 +147,10 @@ def get_mesh_term_for_all(curie_id, description): elif curie_list[0] == "NCBIGene": gene_id = curie_id.split(':')[1] names = QueryNCBIeUtils.get_pubmed_from_ncbi_gene(gene_id) + elif curie_list[0] == "MONDO": + names = list(QueryEBIOLS.get_mesh_id_for_mondo_id(curie_id)) elif curie_list[0] == "DOID": - mesh_id = QueryDisont.query_disont_to_mesh_id(curie_id) - names = [] - for uid in mesh_id: - uid_num = int(uid[1:]) + 68000000 - name = QueryNCBIeUtils.get_mesh_terms_for_mesh_uid(uid_num) - if name is not None: - names += name - if len(names) == 0: - names = None - else: - names[0] = names[0] + '[MeSH Terms]' + raise ValueError(f"NormGoogleDistance.py is now unable to obtain a MeSH ID from a DOID {curie_id}") elif curie_list[0] == "OMIM": names = QueryNCBIeUtils.get_mesh_terms_for_omim_id(curie_list[1]) elif curie_list[0] == "ChEMBL": @@ -176,44 +167,6 @@ def get_mesh_term_for_all(curie_id, description): return names return [description.replace(';', '|')] - @staticmethod - # @CachedMethods.register - def get_ngd_for_all(curie_id_list, description_list): - """ - Takes a list of currie ids and descriptions then calculates the normalized google distance for the set of nodes. - Params: - curie_id_list - a list of strings containing the curie ids of the nodes. Formatted : e.g. 
DOID:8398 - description_list - a list of strings containing the English names for the nodes - """ - assert len(curie_id_list) == len(description_list) - terms = [None] * len(curie_id_list) - for a in range(len(description_list)): - terms[a] = NormGoogleDistance.get_mesh_term_for_all(curie_id_list[a], description_list[a]) - if type(terms[a]) != list: - terms[a] = [terms[a]] - if len(terms[a]) == 0: - terms[a] = [description_list[a]] - if len(terms[a]) > 30: - terms[a] = terms[a][:30] - terms_combined = [''] * len(terms) - mesh_flags = [True] * len(terms) - for a in range(len(terms)): - if len(terms[a]) > 1: - if not terms[a][0].endswith('[uid]'): - for b in range(len(terms[a])): - if QueryNCBIeUtils.is_mesh_term(terms[a][b]) and not terms[a][b].endswith('[MeSH Terms]'): - terms[a][b] += '[MeSH Terms]' - terms_combined[a] = '|'.join(terms[a]) - mesh_flags[a] = False - else: - terms_combined[a] = terms[a][0] - if terms[a][0].endswith('[MeSH Terms]'): - terms_combined[a] = terms[a][0][:-12] - elif not QueryNCBIeUtils.is_mesh_term(terms[a][0]): - mesh_flags[a] = False - ngd = QueryNCBIeUtils.multi_normalized_google_distance(terms_combined, mesh_flags) - return ngd - @staticmethod def api_ngd(mesh_term1, mesh_term2): response = {} diff --git a/code/reasoningtool/kg-construction/QueryChEMBL.py b/code/reasoningtool/kg-construction/QueryChEMBL.py deleted file mode 100644 index c87de8056..000000000 --- a/code/reasoningtool/kg-construction/QueryChEMBL.py +++ /dev/null @@ -1,202 +0,0 @@ -''' Queries the ChEMBL database to find target proteins for drugs. 
-''' - -__author__ = 'Stephen Ramsey' -__copyright__ = 'Oregon State University' -__credits__ = ['Stephen Ramsey'] -__license__ = 'MIT' -__version__ = '0.1.0' -__maintainer__ = '' -__email__ = '' -__status__ = 'Prototype' - -import urllib -# import requests -# import requests_cache -import sys -from cache_control_helper import CacheControlHelper - -from QueryUniprot import QueryUniprot - - -class QueryChEMBL: - API_BASE_URL = 'https://www.ebi.ac.uk/chembl/api/data' - TIMEOUT_SEC = 120 - - @staticmethod - def send_query_get(handler, url_suffix): - - requests = CacheControlHelper() - url = QueryChEMBL.API_BASE_URL + '/' + handler + '?' + url_suffix -# print(url) - try: - res = requests.get(url, - timeout=QueryChEMBL.TIMEOUT_SEC) - except requests.exceptions.Timeout: - print(url, file=sys.stderr) - print('Timeout in QueryChEMBL for URL: ' + url, file=sys.stderr) - return None - except KeyboardInterrupt: - sys.exit(0) - except BaseException as e: - print(url, file=sys.stderr) - print('%s received in QueryChEMBL for URL: %s' % (e, url), file=sys.stderr) - return None - status_code = res.status_code - if status_code != 200: - print(url, file=sys.stderr) - print('Status code ' + str(status_code) + ' for url: ' + url, file=sys.stderr) - return None - return res.json() - - @staticmethod - def get_chembl_ids_for_drug(drug_name): - if not isinstance(drug_name, str): - return set() - - drug_name_safe = urllib.parse.quote(drug_name, safe='') - res = QueryChEMBL.send_query_get(handler='compound_record.json', - url_suffix='compound_name__iexact=' + drug_name_safe) - res_chembl_set = set() - if res is not None: - compound_records = res.get('compound_records', None) - if compound_records is not None: - for compound_record in compound_records: - chembl_id = compound_record.get('molecule_chembl_id', None) - if chembl_id is not None: - res_chembl_set.add(chembl_id) - return res_chembl_set - - @staticmethod - def get_target_uniprot_ids_for_chembl_id(chembl_id): - print(chembl_id, 
file=sys.stderr) - if not isinstance(chembl_id, str): - return dict() - - res_targets_dict = dict() - - target_mechanisms_json = QueryChEMBL.get_mechanisms_for_chembl_id(chembl_id) - for target_mechanism in target_mechanisms_json: - target_chembl_id = target_mechanism.get("target_chembl_id", None) - if target_chembl_id is not None: - target_uniprot_ids = QueryChEMBL.map_chembl_target_to_uniprot_ids(target_chembl_id) - for target_uniprot_id in target_uniprot_ids: - res_targets_dict[target_uniprot_id] = float(1.0) - - res = QueryChEMBL.send_query_get(handler='target_prediction.json', - url_suffix='molecule_chembl_id__exact=' + chembl_id + '&target_organism__exact=Homo%20sapiens') - if res is not None: - target_predictions_list = res.get('target_predictions', None) - if target_predictions_list is not None: - for target_prediction in target_predictions_list: - # print(target_prediction) - target_uniprot_id = target_prediction.get('target_accession', None) - target_probability = target_prediction.get('probability', None) - if target_uniprot_id is not None: - target_organism = target_prediction.get('target_organism', None) - if target_organism is not None: - assert target_organism == "Homo sapiens" - # need to get the gene ID for this Uniprot ID - if target_uniprot_id not in res_targets_dict: - res_targets_dict[target_uniprot_id] = float(target_probability) - - return res_targets_dict - - @staticmethod - def map_chembl_target_to_uniprot_ids(target_chembl_id): - res_json = QueryChEMBL.send_query_get(handler="target.json", - url_suffix="target_chembl_id=" + target_chembl_id) - res_set = set() -# print(res_json) - if res_json is not None: - targets = res_json.get("targets", None) - if targets is not None and len(targets) > 0: - for target in targets: - components = target.get("target_components", None) - if components is not None: - for component in components: - xrefs = component.get("target_component_xrefs", None) - if xrefs is not None: - for xref in xrefs: - if xref is 
not None: - xref_src_db = xref.get("xref_src_db", None) - if xref_src_db is not None: - if xref_src_db == "UniProt": - uniprot_id = xref.get("xref_id", None) - if uniprot_id is not None: - uniprot_id_citeable = QueryUniprot.get_citeable_accession_for_accession(uniprot_id) - if uniprot_id_citeable is not None: - res_set |= set([uniprot_id_citeable]) - return res_set - - @staticmethod - def get_target_uniprot_ids_for_drug(drug_name): - if not isinstance(drug_name, str): - return dict() - - chembl_ids_for_drug = QueryChEMBL.get_chembl_ids_for_drug(drug_name) - res_uniprot_ids = dict() - for chembl_id in chembl_ids_for_drug: - # print(chembl_id) - uniprot_ids_dict = QueryChEMBL.get_target_uniprot_ids_for_chembl_id(chembl_id) - for uniprot_id in uniprot_ids_dict.keys(): - res_uniprot_ids[uniprot_id] = uniprot_ids_dict[uniprot_id] - return res_uniprot_ids - - @staticmethod - def get_mechanisms_for_chembl_id(chembl_id): - """Retrieves mechanism of action and target of each drug. - - Args: - chembl_id (str): a ChEMBL id, e.g., "CHEMBL521" - - Returns: - array: an array of mechanism of actions, or [] if no mechanism data could be obtained for the given - ChEMBL ID - - example: - [ - {"action_type": "INHIBITOR", - "binding_site_comment": null, - "direct_interaction": true, - "disease_efficacy": true, - "max_phase": 4, - "mec_id": 1180, - "mechanism_comment": null, - "mechanism_of_action": "Cyclooxygenase inhibitor", - "mechanism_refs": [ - {"ref_id": "0443-059748 PP. 
229", - "ref_type": "ISBN", - "ref_url": "http://www.isbnsearch.org/isbn/0443059748" - }, - {"ref_id": "Ibuprofen", - "ref_type": "Wikipedia", - "ref_url": "http://en.wikipedia.org/wiki/Ibuprofen"} - ], - "molecular_mechanism": true, - "molecule_chembl_id": "CHEMBL521", - "record_id": 1343587, - "selectivity_comment": null, - "site_id": null, - "target_chembl_id": "CHEMBL2094253"} - ] - """ - if not isinstance(chembl_id, str): - return [] - - res = QueryChEMBL.send_query_get(handler='mechanism.json', - url_suffix='molecule_chembl_id=' + chembl_id) - res_mechanisms_array = [] - if res is not None: - mechanism_records = res.get('mechanisms', None) - if mechanism_records is not None and len(mechanism_records) > 0: - res_mechanisms_array = mechanism_records - return res_mechanisms_array - - -if __name__ == '__main__': - print(QueryChEMBL.get_target_uniprot_ids_for_chembl_id('CHEMBL521')) - print(QueryChEMBL.get_target_uniprot_ids_for_chembl_id('CHEMBL2364648')) -# print(QueryChEMBL.get_mechanisms_for_chembl_id("CHEMBL521")) -# print(QueryChEMBL.map_chembl_target_to_uniprot_ids("CHEMBL2094253")) -# print(QueryChEMBL.get_mechanisms_for_chembl_id("CHEMBL521")) diff --git a/code/reasoningtool/kg-construction/QueryDisont.py b/code/reasoningtool/kg-construction/QueryDisont.py deleted file mode 100644 index 5db49c02d..000000000 --- a/code/reasoningtool/kg-construction/QueryDisont.py +++ /dev/null @@ -1,114 +0,0 @@ -""" This module is the definition of class QueryDisont. It is written to connect - with disease-ontology to query disease ontology and mesh id of given disont_id. 
-""" - -__author__ = "" -__copyright__ = "" -__credits__ = [] -__license__ = "" -__version__ = "" -__maintainer__ = "" -__email__ = "" -__status__ = "Prototype" - -# import requests -import sys - -from cache_control_helper import CacheControlHelper - -class QueryDisont: - TIMEOUT_SEC = 120 - API_BASE_URL = 'http://www.disease-ontology.org/api' - - @staticmethod - def send_query_get(handler, url_suffix): - - requests = CacheControlHelper() - url = QueryDisont.API_BASE_URL + "/" + handler + "/" + url_suffix -# print(url_str) - try: - res = requests.get(url, timeout=QueryDisont.TIMEOUT_SEC) - except requests.exceptions.Timeout: - print(url, file=sys.stderr) - print('Timeout in QueryDisont for URL: ' + url, file=sys.stderr) - return None - except BaseException as e: - print(url, file=sys.stderr) - print('%s received in QueryDisont for URL: %s' % (e, url), file=sys.stderr) - return None - - status_code = res.status_code - if status_code != 200: - print('Status code ' + str(status_code) + ' for url: ' + url, file=sys.stderr) - return None - return res - - @staticmethod - def query_disont_to_child_disonts(disont_id): - """for a disease ontology ID (including prefix "DOID:", with zero padding), return child DOIDs - - :param disont_id: string, like ``'DOID:14069'`` - :returns: ``set`` with keys as DOIDs - """ - res = QueryDisont.send_query_get('metadata', disont_id) - ret_set = set() - if res is not None: - res_json = res.json() -# print(res_json) - disease_children_list = res_json.get("children", None) - if disease_children_list is not None: - ret_set |= set([int(disease_child_list[1].split(':')[1]) for disease_child_list in disease_children_list]) - return ret_set - - @staticmethod - def query_disont_to_label(disont_id): - res = QueryDisont.send_query_get('metadata', disont_id) - ret_label = '' - if res is not None: - res_json = res.json() - ret_label = res_json.get('name', '') - return ret_label - - @staticmethod - def query_disont_to_child_disonts_desc(disont_id): - 
"""for a disease ontology ID (including prefix "DOID:", with zero padding), return child DOIDs - - :param disont_id: string, like ``'DOID:14069'`` - :returns: ``dict`` with keys as DOIDs and values as human-readable disease names - """ - - res = QueryDisont.send_query_get('metadata', disont_id) - ret_dict = dict() - if res is not None: - res_json = res.json() -# print(res_json) - disease_children_list = res_json.get("children", None) - if disease_children_list is not None: - ret_dict = dict([[disease_child_list[1], disease_child_list[0]] for disease_child_list in disease_children_list]) - return ret_dict - - @staticmethod - def query_disont_to_mesh_id(disont_id): - """convert a disease ontology ID (including prefix "DOID:", with zero padding) to MeSH ID - - :param disont_id: string, like ``'DOID:14069'`` - """ - res = QueryDisont.send_query_get('metadata', disont_id) - ret_set = set() - if res is not None: - res_json = res.json() - xref_strs = res_json.get("xrefs", None) - if xref_strs is not None: - ret_set |= set([xref_str.split('MESH:')[1] for xref_str in xref_strs if 'MESH:' in xref_str]) - return ret_set - -if __name__ == '__main__': - print(QueryDisont.query_disont_to_label("DOID:0050741")) - print(QueryDisont.query_disont_to_mesh_id("DOID:9352")) - print(QueryDisont.query_disont_to_mesh_id("DOID:1837")) - print(QueryDisont.query_disont_to_mesh_id("DOID:10182")) - print(QueryDisont.query_disont_to_mesh_id("DOID:11712")) - print(QueryDisont.query_disont_to_child_disonts_desc("DOID:9352")) - print(QueryDisont.query_disont_to_mesh_id("DOID:14069")) - print(QueryDisont.query_disont_to_child_disonts_desc("DOID:12365")) - print(QueryDisont.query_disont_to_mesh_id("DOID:0050741")) diff --git a/code/reasoningtool/kg-construction/QueryEBIOLS.py b/code/reasoningtool/kg-construction/QueryEBIOLS.py index 65201e3f2..20c499bab 100644 --- a/code/reasoningtool/kg-construction/QueryEBIOLS.py +++ b/code/reasoningtool/kg-construction/QueryEBIOLS.py @@ -19,7 +19,7 @@ class 
QueryEBIOLS: TIMEOUT_SEC = 120 - API_BASE_URL = "https://www.ebi.ac.uk/ols/api/ontologies" + API_BASE_URL = "https://www.ebi.ac.uk/ols4/api/ontologies" HANDLER_MAP = { 'get_anatomy': '{ontology}/terms/{id}', 'get_phenotype': '{ontology}/terms/{id}', @@ -65,7 +65,7 @@ def get_bto_term_for_bto_id(bto_curie_id): """ bto_iri = "http://purl.obolibrary.org/obo/" + bto_curie_id.replace(":", "_") bto_iri_double_encoded = urllib.parse.quote_plus(urllib.parse.quote_plus(bto_iri)) - res = QueryEBIOLS.send_query_get("bto/terms/", bto_iri_double_encoded) + res = QueryEBIOLS.send_query_get("bto/terms", bto_iri_double_encoded) ret_label = None if res is not None: res_json = res.json() @@ -81,7 +81,7 @@ def get_bto_id_for_uberon_id(uberon_curie_id): """ uberon_iri = "http://purl.obolibrary.org/obo/" + uberon_curie_id.replace(":", "_") uberon_iri_double_encoded = urllib.parse.quote_plus(urllib.parse.quote_plus(uberon_iri)) - res = QueryEBIOLS.send_query_get("uberon/terms/", uberon_iri_double_encoded) + res = QueryEBIOLS.send_query_get("uberon/terms", uberon_iri_double_encoded) ret_list = list() if res is not None: res_json = res.json() @@ -101,7 +101,7 @@ def get_mesh_id_for_uberon_id(uberon_curie_id): """ uberon_iri = "http://purl.obolibrary.org/obo/" + uberon_curie_id.replace(":", "_") uberon_iri_double_encoded = urllib.parse.quote_plus(urllib.parse.quote_plus(uberon_iri)) - res = QueryEBIOLS.send_query_get("uberon/terms/", uberon_iri_double_encoded) + res = QueryEBIOLS.send_query_get("uberon/terms", uberon_iri_double_encoded) ret_list = list() if res is not None: res_json = res.json() @@ -178,29 +178,30 @@ def get_cellular_component_description(cc_id): def get_molecular_function_description(mf_id): return QueryEBIOLS.__get_entity("get_molecular_function", mf_id) + @staticmethod + def get_mesh_id_for_mondo_id(mondo_curie_id): + """ + Converts a disease MONDO ID to MeSH id + :param mondo_curie_id: eg. "MONDO:0005148" + :return: a set of MeSH id's (eg. 
{"MESH:D003924"}) + """ + mondo_iri = "http://purl.obolibrary.org/obo/" + mondo_curie_id.replace(":", "_") + mondo_iri_double_encoded = urllib.parse.quote_plus(urllib.parse.quote_plus(mondo_iri)) + res = QueryEBIOLS.send_query_get("mondo/terms", mondo_iri_double_encoded) + ret_list = list() + if res is not None: + res_json = res.json() + res_annotation = res_json.get("annotation", None) + if res_annotation is not None: + db_x_refs = res_annotation.get("database_cross_reference", None) + if db_x_refs is not None: + ret_list = [mesh_id for mesh_id in db_x_refs if "MESH:" in mesh_id] + return set(ret_list) + if __name__ == "__main__": print(QueryEBIOLS.get_bto_id_for_uberon_id("UBERON:0000178")) print(QueryEBIOLS.get_bto_term_for_bto_id("BTO:0000089")) print(QueryEBIOLS.get_mesh_id_for_uberon_id("UBERON:0002107")) print(QueryEBIOLS.get_mesh_id_for_uberon_id("UBERON:0001162")) - - def save_to_test_file(key, value): - f = open('tests/query_desc_test_data.json', 'r+') - try: - json_data = json.load(f) - except ValueError: - json_data = {} - f.seek(0) - f.truncate() - json_data[key] = value - json.dump(json_data, f) - f.close() - - save_to_test_file('UBERON:0004476', QueryEBIOLS.get_anatomy_description('UBERON:0004476')) - save_to_test_file('CL:0000038', QueryEBIOLS.get_anatomy_description('CL:0000038')) - save_to_test_file('GO:0042535', QueryEBIOLS.get_bio_process_description('GO:0042535')) - save_to_test_file('HP:0011105', QueryEBIOLS.get_phenotype_description('HP:0011105')) - save_to_test_file('GO:0005573', QueryEBIOLS.get_cellular_component_description('GO:0005573')) - save_to_test_file('GO:0004689', QueryEBIOLS.get_molecular_function_description('GO:0004689')) - save_to_test_file('OMIM:604348', QueryEBIOLS.get_disease_description('OMIM:604348')) + print(QueryEBIOLS.get_mesh_id_for_mondo_id("MONDO:0005148")) diff --git a/code/reasoningtool/kg-construction/QueryMyChem.py b/code/reasoningtool/kg-construction/QueryMyChem.py index 1b3aecede..3693c42d5 100644 --- 
a/code/reasoningtool/kg-construction/QueryMyChem.py +++ b/code/reasoningtool/kg-construction/QueryMyChem.py @@ -21,9 +21,6 @@ import sys import json -from QueryPubChem import QueryPubChem - - class QueryMyChem: TIMEOUT_SEC = 120 API_BASE_URL = 'http://mychem.info/v1' @@ -201,7 +198,6 @@ def get_meddra_codes_for_side_effects(chembl_id): return meddra_code_set if chembl_id[:7].upper() == "CHEMBL:": chembl_id = "CHEMBL" + chembl_id[7:] - # pubchem_id = QueryPubChem.get_pubchem_id_for_chembl_id(chembl_id) pubchem_id = QueryMyChem.get_pubchem_cid(chembl_id) if pubchem_id is None: return meddra_code_set diff --git a/code/reasoningtool/kg-construction/QueryMyGene.py b/code/reasoningtool/kg-construction/QueryMyGene.py deleted file mode 100644 index f96c42352..000000000 --- a/code/reasoningtool/kg-construction/QueryMyGene.py +++ /dev/null @@ -1,530 +0,0 @@ -""" This module defines the class QueryMyGene. -QueryMyGene is written to query gene annotation information via python package -mygene. It can convert among gene symbol, uniprot id, entrez gene id, mirbase id. -""" - -__author__ = "" -__copyright__ = "" -__credits__ = [] -__license__ = "" -__version__ = "" -__maintainer__ = "" -__email__ = "" -__status__ = "Prototype" - -# import mygene -import sys -# import requests -import json -# import requests_cache - -from cache_control_helper import CacheControlHelper - - -class QueryMyGene: - def __init__(self, debug=False): - # self.mygene_obj = mygene.MyGeneInfo() - self.debug = debug - - ONT_NAME_TO_SIMPLE_NODE_TYPE = {'BP': 'biological_process', - 'MF': 'molecular_function', - 'CC': 'cellular_component'} - - TIMEOUT_SEC = 120 - API_BASE_URL = 'http://mygene.info/v3' - HANDLER_MAP = { - 'query': 'query', - 'gene': 'gene' - } - - @staticmethod - def __access_api(handler, url_suffix, params=None, return_raw=False): - - requests = CacheControlHelper() - if url_suffix: - url = QueryMyGene.API_BASE_URL + '/' + handler + '?' 
+ url_suffix - else: - url = QueryMyGene.API_BASE_URL + '/' + handler - headers = {'user-agent': "mygene.py/%s python-requests/%s" % ("1.0.0", "1.0.0"), 'Accept': 'application/json'} - try: - res = requests.get(url, params=params, timeout=QueryMyGene.TIMEOUT_SEC, headers=headers) - except requests.exceptions.Timeout: - print(url, file=sys.stderr) - print('Timeout in QueryMyGene for URL: ' + url, file=sys.stderr) - return None - except KeyboardInterrupt: - sys.exit(0) - except BaseException as e: - print(url, file=sys.stderr) - print('%s received in QueryMyGene for URL: %s' % (e, url), file=sys.stderr) - return None - status_code = res.status_code - if status_code != 200: - print(url, file=sys.stderr) - print('Status code ' + str(status_code) + ' for url: ' + url, file=sys.stderr) - return None - if return_raw: - return res.text - else: - return res.json() - - @staticmethod - def unnest(lst, skip_type): - """ - To unnest a list like `["foo", ["bar", "baz"]]` to `["foo", "bar", "baz"]`. - Elements of `skip_type` will be leaf as is. 
- """ - def generate_elements(lst, skip_type): - for e in lst: - if isinstance(e, skip_type): - yield e - else: - yield from e - - return list(generate_elements(lst, skip_type)) - - def convert_gene_symbol_to_uniprot_id(self, gene_symbol): - # try: - # res = self.mygene_obj.query('symbol:' + gene_symbol, species='human', - # fields='uniprot', verbose=False) - # except requests.exceptions.HTTPError: - # print('HTTP error for querying gene symbol to uniprot in mygene: ' + gene_symbol, file=sys.stderr) - # res = None - - handler = QueryMyGene.HANDLER_MAP['query'] - url_suffix = "q=symbol:" + gene_symbol + "&species=human&fields=uniprot" - res = QueryMyGene.__access_api(handler, url_suffix) - - uniprot_ids_set = set() - if res is not None and len(res) > 0: - uniprot_ids_list = [] - for hit in res['hits']: - uniprot_hit = hit.get("uniprot", None) - if uniprot_hit is not None: - uniprot_id = uniprot_hit.get("Swiss-Prot", None) - if uniprot_id is not None: - uniprot_ids_list.append(uniprot_id) - else: - if self.debug: - print("Could not find Uniprot ID for gene symbol: " + gene_symbol) - uniprot_ids_list = QueryMyGene.unnest(uniprot_ids_list, str) - uniprot_ids_set = set(uniprot_ids_list) - return uniprot_ids_set - - def convert_uniprot_id_to_gene_symbol(self, uniprot_id): - # try: - # res = self.mygene_obj.query('uniprot:' + uniprot_id, species='human', - # fields='symbol', verbose=False) - # except requests.exceptions.HTTPError: - # print('HTTP error for querying uniprot to gene symbol mygene: ' + uniprot_id, file=sys.stderr) - # res = None - - handler = QueryMyGene.HANDLER_MAP['query'] - url_suffix = "q=uniprot:" + uniprot_id + "&species=human&fields=symbol" - res = QueryMyGene.__access_api(handler, url_suffix) - - gene_symbol = set() - if res is not None and len(res) > 0: - res_hits = res.get('hits', None) - if res_hits is not None: - gene_symbol = set([hit['symbol'] for hit in res_hits]) - else: - print("QueryMyGene.convert_uniprot_id_to_gene_symbol: no \'hits\' 
result data for uniprot_id: " + uniprot_id, file=sys.stderr) - gene_symbol = set([hit["symbol"] for hit in res_hits]) - return gene_symbol - - def convert_uniprot_id_to_entrez_gene_ID(self, uniprot_id): - # requests = CacheControlHelper() - # try: - # res = self.mygene_obj.query('uniprot:' + uniprot_id, species='human', - # fields='entrezgene', verbose=False) - # except requests.exceptions.HTTPError: - # print('HTTP error for querying uniprot-to-entrezgene in mygene: ' + uniprot_id, file=sys.stderr) - # res = None - - handler = QueryMyGene.HANDLER_MAP['query'] - url_suffix = "q=uniprot:" + uniprot_id + "&species=human&fields=entrezgene" - res = QueryMyGene.__access_api(handler, url_suffix) - - entrez_ids = set() - if res is not None and len(res) > 0: - res_hits = res.get('hits', None) - if res_hits is not None: - for hit in res_hits: - entrez_id = hit.get('entrezgene', None) - if entrez_id is not None: - entrez_ids.add(entrez_id) -# entrez_ids = set([hit["entrezgene"] for hit in res_hits]) - else: - print("QueryMyGene.convert_uniprot_id_to_entrez_gene_ID: no \'hits\' result data for uniprot_id: " + uniprot_id, file=sys.stderr) - return entrez_ids - - def convert_hgnc_gene_id_to_uniprot_id(self, hgnc_id): - uniprot_ids = set() - - # requests = CacheControlHelper() - # try: - # res = self.mygene_obj.query(hgnc_id, species='human', - # fields='uniprot', verbose=False) - # except requests.exceptions.HTTPError: - # print("HTTP error in mygene_obj.query for query string: " + hgnc_id, file=sys.stderr) - # return uniprot_ids - - handler = QueryMyGene.HANDLER_MAP['query'] - url_suffix = "q=" + hgnc_id + "&species=human&fields=uniprot" - res = QueryMyGene.__access_api(handler, url_suffix) - - if res is not None and len(res) > 0: - for hit in res['hits']: - uniprot_id_dict = hit.get('uniprot', None) - if uniprot_id_dict is not None: - uniprot_id = uniprot_id_dict.get('Swiss-Prot', None) - if uniprot_id is not None: - if type(uniprot_id) == str: - uniprot_ids.add(uniprot_id) - 
else: - uniprot_ids.union(uniprot_id) - return uniprot_ids - - def convert_gene_symbol_to_entrez_gene_ID(self, gene_symbol): - entrez_ids = set() - - # requests = CacheControlHelper() - # try: - # res = self.mygene_obj.query('symbol:' + gene_symbol, species='human', - # fields='entrezgene', verbose=False) - # except requests.exceptions.HTTPError: - # print("HTTP error in mygene_obj.query for query string: " + gene_symbol, file=sys.stderr) - # return entrez_ids - - handler = QueryMyGene.HANDLER_MAP['query'] - url_suffix = "q=symbol:" + gene_symbol + "&species=human&fields=entrezgene" - res = QueryMyGene.__access_api(handler, url_suffix) - - if res is not None and len(res) > 0: - entrez_ids = set() - for hit in res['hits']: - entrez_id = hit.get('entrezgene', None) - if entrez_id is not None: - entrez_ids.add(entrez_id) - return entrez_ids - - def convert_entrez_gene_id_to_uniprot_id(self, entrez_gene_id): - assert type(entrez_gene_id) == int - uniprot_id = set() - - # requests = CacheControlHelper() - # try: - # res = self.mygene_obj.query('entrezgene:' + str(entrez_gene_id), species='human', fields='uniprot', verbose=False) - # except requests.exceptions.HTTPError: - # print("HTTP error in mygene_obj.query for query string: " + entrez_gene_id, file=sys.stderr) - # return uniprot_id - - handler = QueryMyGene.HANDLER_MAP['query'] - url_suffix = "q=entrezgene:" + str(entrez_gene_id) + "&species=human&fields=uniprot" - res = QueryMyGene.__access_api(handler, url_suffix) - - if res is not None and len(res) > 0: - res_hits = res.get("hits", None) - if res_hits is not None and type(res_hits) == list: - for hit in res_hits: - res_uniprot_id_dict = hit.get("uniprot", None) - if res_uniprot_id_dict is not None: - res_uniprot_id = res_uniprot_id_dict.get("Swiss-Prot", None) - if res_uniprot_id is not None: - if type(res_uniprot_id) == str: - uniprot_id.add(res_uniprot_id) - else: - if type(res_uniprot_id) == list: - for uniprot_id_item in res_uniprot_id: - 
uniprot_id.add(uniprot_id_item) - return uniprot_id - - def convert_entrez_gene_ID_to_mirbase_ID(self, entrez_gene_id): - assert type(entrez_gene_id) == int - mirbase_id = set() - - # requests = CacheControlHelper() - # try: - # res = self.mygene_obj.query('entrezgene:' + str(entrez_gene_id), species='human', fields='miRBase', verbose=False) - # except requests.exceptions.HTTPError: - # print("HTTP error in mygene_obj.query for query string: " + entrez_gene_id, file=sys.stderr) - # return mirbase_id - - handler = QueryMyGene.HANDLER_MAP['query'] - url_suffix = "q=entrezgene:" + str(entrez_gene_id) + "&species=human&fields=miRBase" - res = QueryMyGene.__access_api(handler, url_suffix) - - if res is not None and len(res) > 0: - res_hits = res.get("hits", None) - if res_hits is not None and type(res_hits) == list: - for hit in res_hits: - res_mirbase_id = hit.get("miRBase", None) - if res_mirbase_id is not None: - mirbase_id.add(res_mirbase_id) - else: - print("QueryMyGene.convert_entrez_gene_ID_to_mirbase_ID result missing miRBase field where it was expected; Entrez Gene ID: " + - str(entrez_gene_id), file=sys.stderr) - return mirbase_id - - def get_gene_ontology_ids_bp_for_uniprot_id(self, uniprot_id): - assert type(uniprot_id) == str - res = dict() - - # requests = CacheControlHelper() - # try: - # q_res = self.mygene_obj.query('uniprot:' + uniprot_id, species='human', fields='go', verbose=False) - # except requests.exceptions.HTTPError: - # print("HTTP error in mygene_obj.query for query string: " + uniprot_id, file=sys.stderr) - # return res - - handler = QueryMyGene.HANDLER_MAP['query'] - url_suffix = "q=uniprot:" + uniprot_id + "&species=human&fields=go" - q_res = QueryMyGene.__access_api(handler, url_suffix) - - if q_res is None: - return res - - q_res_hits = q_res.get('hits', None) - if q_res_hits is not None: - if type(q_res_hits) == list and len(q_res_hits) > 0: - for q_res_hit in q_res_hits: - if type(q_res_hit) == dict: - q_res_go = q_res_hit.get('go', 
None) - if q_res_go is not None: - q_res_bp = q_res_go.get('BP', None) - if q_res_bp is not None: - if type(q_res_bp) == list and len(q_res_bp) > 0: - res_add = {item["id"]: item["term"] for item in q_res_bp} - res.update(res_add) - return res - - def get_gene_ontology_ids_for_uniprot_id(self, uniprot_id): - assert type(uniprot_id) == str - res = dict() - - # requests = CacheControlHelper() - # try: - # q_res = self.mygene_obj.query('uniprot:' + uniprot_id, species='human', fields='go', verbose=False) - # except requests.exceptions.HTTPError: - # print("HTTP error in mygene_obj.query for query string: " + uniprot_id, file=sys.stderr) - # return res - - handler = QueryMyGene.HANDLER_MAP['query'] - url_suffix = "q=uniprot:" + uniprot_id + "&species=human&fields=go" - q_res = QueryMyGene.__access_api(handler, url_suffix) - - if q_res is None: - return res - - q_res_hits = q_res.get('hits', None) - if q_res_hits is not None: - if type(q_res_hits) == list and len(q_res_hits) > 0: - for q_res_hit in q_res_hits: - if type(q_res_hit) == dict: - q_res_go = q_res_hit.get('go', None) - if q_res_go is not None: - for ont_name, ont_dict_list in q_res_go.items(): - ont_name_simple_node_type = self.ONT_NAME_TO_SIMPLE_NODE_TYPE[ont_name] - for ont_dict in ont_dict_list: - if type(ont_dict) == dict: - term = ont_dict.get('term', None) - id = ont_dict.get('id', None) - res.update({id: {'term': term, - 'ont': ont_name_simple_node_type}}) - return res - - def get_gene_ontology_ids_bp_for_entrez_gene_id(self, entrez_gene_id): - res = dict() - assert type(entrez_gene_id) == int - # q_res = self.mygene_obj.query('entrezgene:' + str(entrez_gene_id), species='human', fields='go', verbose=False) - - handler = QueryMyGene.HANDLER_MAP['query'] - url_suffix = "q=entrezgene:" + str(entrez_gene_id) + "&species=human&fields=go" - q_res = QueryMyGene.__access_api(handler, url_suffix) - - if q_res is None: - return res - - q_res_hits = q_res.get('hits', None) - if q_res_hits is not None: - if 
type(q_res_hits) == list and len(q_res_hits) > 0: - for q_res_hit in q_res_hits: - if type(q_res_hit) == dict: - q_res_go = q_res_hit.get('go', None) - if q_res_go is not None: - q_res_bp = q_res_go.get('BP', None) - if q_res_bp is not None: - if type(q_res_bp) == list and len(q_res_bp) > 0: - res_add = {item["id"]: item["term"] for item in q_res_bp} - res.update(res_add) - return res - - def uniprot_id_is_human(self, uniprot_id_str): - # res_json = self.mygene_obj.query("uniprot:" + uniprot_id_str, species="human", verbose=False) - - handler = QueryMyGene.HANDLER_MAP['query'] - url_suffix = "q=uniprot:" + uniprot_id_str + "&species=human" - res_json = QueryMyGene.__access_api(handler, url_suffix) - - if res_json is None: - return False - - hits = res_json.get("hits", None) - return hits is not None and len(hits) > 0 - - def get_cui(self, gene_id): - if gene_id.startswith('NCBIGene'): - gene_id = int(gene_id.split(':')[1]) - # res = self.mygene_obj.getgene(gene_id, fields='umls', verbose=False) - - handler = QueryMyGene.HANDLER_MAP['gene'] + '/' + str(gene_id) - url_suffix = 'fields=umls' - res = QueryMyGene.__access_api(handler, url_suffix) - - if res is not None: - cui_res = res.get('umls', None) - else: - cui_res = None - cuis = None - if cui_res is not None: - cuis = [cui_res['cui']] - return cuis - elif gene_id.startswith('UniProt'): - uni_id = 'uniprot:' + gene_id.split(':')[1] - # res = self.mygene_obj.query(uni_id, fields='umls', verbose=False) - - handler = QueryMyGene.HANDLER_MAP['query'] - url_suffix = "q=" + uni_id + "&fields=umls" - res = QueryMyGene.__access_api(handler, url_suffix) - - if res is not None: - cuis = [] - if 'hits' in res.keys(): - for hit in res['hits']: - if 'umls' in hit.keys(): - cuis.append(hit['umls']['cui']) - if len(cuis) > 0: - return cuis - else: - return None - return None - - @staticmethod - def get_protein_entity(protein_id): - # mg = mygene.MyGeneInfo() - # results = str(mg.query(protein_id.replace('UniProtKB', 'UniProt'), 
fields='all', return_raw='True', verbose=False)) - - handler = QueryMyGene.HANDLER_MAP['query'] - # url_suffix = "q=" + protein_id.replace('UniProtKB', 'UniProt') + "&fields=all" - params = {'q': protein_id.replace('UniProtKB', 'UniProt'), 'fields': 'all'} - results = str(QueryMyGene.__access_api(handler, None, params=params, return_raw=True)) - - result_str = 'None' - if len(results) > 100: - json_dict = json.loads(results) - result_str = json.dumps(json_dict) - return result_str - - @staticmethod - def get_microRNA_entity(microrna_id): - # mg = mygene.MyGeneInfo() - # results = str(mg.query(microrna_id.replace('NCBIGene', 'entrezgene'), fields='all', return_raw='True', verbose=False)) - - handler = QueryMyGene.HANDLER_MAP['query'] - # url_suffix = "q=" + microrna_id.replace('NCBIGene', 'entrezgene') + "&fields=all" - params = {'q': microrna_id.replace('NCBIGene', 'entrezgene'), 'fields': 'all'} - results = str(QueryMyGene.__access_api(handler, None, params=params, return_raw=True)) - - result_str = 'None' - if len(results) > 100: - json_dict = json.loads(results) - result_str = json.dumps(json_dict) - return result_str - - def get_protein_desc(self, protein_id): - if not isinstance(protein_id, str): - return "None" - result_str = self.get_protein_entity(protein_id) - desc = "None" - if result_str != "None": - result_dict = json.loads(result_str) - if "hits" in result_dict.keys(): - if len(result_dict["hits"]) > 0: - if "summary" in result_dict["hits"][0].keys(): - desc = result_dict["hits"][0]["summary"] - return desc - - def get_microRNA_desc(self, microrna_id): - if not isinstance(microrna_id, str): - return "None" - result_str = self.get_microRNA_entity(microrna_id) - desc = "None" - if result_str != "None": - result_dict = json.loads(result_str) - if "hits" in result_dict.keys(): - if len(result_dict["hits"]) > 0: - if "summary" in result_dict["hits"][0].keys(): - desc = result_dict["hits"][0]["summary"] - return desc - - - def get_protein_name(self, 
protein_id): - if not isinstance(protein_id, str): - return "None" - result_str = self.get_protein_entity(protein_id) - name = "None" - if result_str != "None": - result_dict = json.loads(result_str) - if "hits" in result_dict.keys(): - if len(result_dict["hits"]) > 0: - if "name" in result_dict["hits"][0].keys(): - name = result_dict["hits"][0]["name"] - return name - -if __name__ == '__main__': - mg = QueryMyGene() - print(mg.convert_gene_symbol_to_uniprot_id('A2M')) - print(mg.convert_gene_symbol_to_uniprot_id('A1BG')) - print(mg.convert_gene_symbol_to_uniprot_id("HMOX1")) - print(mg.convert_gene_symbol_to_uniprot_id('RAD54B')) - print(mg.convert_gene_symbol_to_uniprot_id('NS2')) - print(mg.convert_uniprot_id_to_gene_symbol("P09601")) - print(mg.convert_uniprot_id_to_gene_symbol('Q05925')) - print(mg.convert_uniprot_id_to_gene_symbol('Q8NBZ7')) - print(mg.convert_uniprot_id_to_entrez_gene_ID("P09601")) - print(mg.convert_uniprot_id_to_entrez_gene_ID("XYZZY")) - print(mg.convert_hgnc_gene_id_to_uniprot_id('HGNC:4944')) - print(mg.convert_hgnc_gene_id_to_uniprot_id('HGNC:49440')) - print(mg.convert_gene_symbol_to_entrez_gene_ID('MIR96')) - print(mg.convert_entrez_gene_id_to_uniprot_id(9837)) - print(mg.convert_entrez_gene_ID_to_mirbase_ID(407053)) - print(mg.get_gene_ontology_ids_for_uniprot_id('Q05925')) - print(mg.get_gene_ontology_ids_bp_for_entrez_gene_id(406991)) - print(mg.uniprot_id_is_human("P02794")) - print(mg.uniprot_id_is_human("P10592")) - print(mg.get_cui("NCBIGene:100847086")) - print(mg.get_cui("UniProtKB:O60884")) - - - def save_to_test_file(filename, key, value): - f = open(filename, 'r+') - try: - json_data = json.load(f) - except ValueError: - json_data = {} - f.seek(0) - f.truncate() - json_data[key] = value - json.dump(json_data, f) - f.close() - - save_to_test_file('tests/query_test_data.json', 'UniProtKB:O60884', mg.get_protein_entity("UniProtKB:O60884")) - save_to_test_file('tests/query_test_data.json', 'NCBIGene:100847086', 
mg.get_microRNA_entity("NCBIGene:100847086")) - print(mg.get_protein_desc("UniProtKB:O60884")) - print(mg.get_protein_desc("UniProtKB:O608840")) - print(mg.get_microRNA_desc("NCBIGene:100847086")) - print(mg.get_microRNA_desc("NCBIGene:1008470860")) - - print(mg.get_protein_name("UniProtKB:P05231")) - print(mg.get_protein_name("UniProtKB:Q8IW03")) \ No newline at end of file diff --git a/code/reasoningtool/kg-construction/QueryPubChem.py b/code/reasoningtool/kg-construction/QueryPubChem.py deleted file mode 100644 index 06b537300..000000000 --- a/code/reasoningtool/kg-construction/QueryPubChem.py +++ /dev/null @@ -1,225 +0,0 @@ -__author__ = 'Stephen Ramsey' -__copyright__ = 'Oregon State University' -__credits__ = ['Stephen Ramsey', 'Finn Womack'] -__license__ = 'MIT' -__version__ = '0.1.0' -__maintainer__ = '' -__email__ = '' -__status__ = 'Prototype' - -import urllib -import pandas -# import requests -import sys -import time -import math -from io import StringIO -import re -import os -import CachedMethods -# import requests_cache -import json -from cache_control_helper import CacheControlHelper - - -class QueryPubChem: - API_BASE_URL = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug' - TIMEOUT_SEC = 120 - HANDLER_MAP = { - 'get_pubchem_cid': 'substance/sid/{sid}/JSON', - 'get_description_url': 'compound/cid/{cid}/description/JSON' - } - - @staticmethod - def __access_api(handler): - requests = CacheControlHelper() - url = QueryPubChem.API_BASE_URL + '/' + handler - # print(url) - try: - res = requests.get(url, timeout=QueryPubChem.TIMEOUT_SEC) - except requests.exceptions.Timeout: - print(url, file=sys.stderr) - print('Timeout in QueryPubChem for URL: ' + url, file=sys.stderr) - return None - except KeyboardInterrupt: - sys.exit(0) - except BaseException as e: - print(url, file=sys.stderr) - print('%s received in QueryPubChem for URL: %s' % (e, url), file=sys.stderr) - return None - status_code = res.status_code - if status_code != 200: - print(url, file=sys.stderr) 
- print('Status code ' + str(status_code) + ' for url: ' + url, file=sys.stderr) - return None - return res.json() - - @staticmethod - def send_query_get(handler, url_suffix): - requests = CacheControlHelper() - url = QueryPubChem.API_BASE_URL + '/' + handler + '/' + url_suffix - # print(url) - try: - res = requests.get(url, timeout=QueryPubChem.TIMEOUT_SEC) - except requests.exceptions.Timeout: - print(url, file=sys.stderr) - print('Timeout in QueryPubChem for URL: ' + url, file=sys.stderr) - return None - except BaseException as e: - print(url, file=sys.stderr) - print('%s received in QueryPubChem for URL: %s' % (e, url), file=sys.stderr) - return None - status_code = res.status_code - if status_code != 200: - print(url, file=sys.stderr) - print('Status code ' + str(status_code) + ' for url: ' + url, file=sys.stderr) - return None - return res.json() - - @staticmethod - def get_chembl_ids_for_drug(drug_name): - drug_name_safe = urllib.parse.quote(drug_name, safe='') - res = QueryPubChem.send_query_get(handler='compound/name', - url_suffix=drug_name_safe + '/synonyms/JSON') - res_chembl_set = set() - if res is not None: - information_list_dict = res.get('InformationList', None) - if information_list_dict is not None: - information_list = information_list_dict.get('Information', None) - if information_list is not None: - for information_dict in information_list: - synonyms = information_dict.get('Synonym', None) - if synonyms is not None: - for syn in synonyms: - if syn.startswith('CHEMBL'): - res_chembl_set.add(syn) - # res_chembl_set.add('ChEMBL:' + syn.replace('CHEMBL', '')) - return res_chembl_set - - # @staticmethod - # def test(): - # print(QueryPubChem.get_chembl_ids_for_drug('gne-493')) - # print(QueryChEMBL.get_target_uniprot_ids_for_drug('clothiapine')) - - @staticmethod - # @CachedMethods.register - def get_pubchem_id_for_chembl_id(chembl_id): - """This takes a chembl id and then looks up the corresponding pubchem id from a pre-generated .tsv - - NOTE: 
pubchem-chembl mappings .tsv generated using https://pubchem.ncbi.nlm.nih.gov/idexchange/idexchange.cgi - it took ~3 or so seconds to map all ids in the KG (2226 ids) and not all ids were successful (missed 204 terms -> ~91% success rate) - """ - dir_path = os.path.dirname(os.path.realpath(__file__)) - df = pandas.read_csv(dir_path + '/chemblMap.tsv', sep='\t', index_col=0, header=None) - try: - ans = df.loc[chembl_id].iloc[0] - except KeyError: - return None - if math.isnan(ans): - return None - else: - return str(int(ans)) - - @staticmethod - # @CachedMethods.register - def get_pubmed_id_for_pubchem_id(pubchem_id): - """ - This takes a PubChem id and then gets the PMIDs for articles on PubMed from PubChem which include this entity. - """ - if not isinstance(pubchem_id, str): - return None - - requests = CacheControlHelper() - url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str(pubchem_id) + '/xrefs/PubMedID/JSON' - try: - r = requests.get(url, timeout=10) - except requests.exceptions.Timeout: - print(url, file=sys.stderr) - print('Timeout in QueryPubChem for URL: ' + url, file=sys.stderr) - return None - except BaseException as e: - print(url, file=sys.stderr) - print('%s received in QueryPubChem for URL: %s' % (e, url), file=sys.stderr) - return None - if r is not None: - if 'Fault' in r.json().keys(): - return None - else: - ans = [str(x) + '[uid]' for x in r.json()['InformationList']['Information'][0]['PubMedID']] - return ans - else: - return None - - @staticmethod - def get_pubchem_cid(pubchem_sid): - pubchem_cid = None - if not isinstance(pubchem_sid, str): - return pubchem_cid - handler = QueryPubChem.HANDLER_MAP['get_pubchem_cid'].format(sid=pubchem_sid) - res = QueryPubChem.__access_api(handler) - if res is not None: - if 'PC_Substances' in res.keys(): - substance = res['PC_Substances'][0] - if len(substance) > 0: - if 'compound' in substance.keys(): - compounds = substance['compound'] - if len(compounds) > 1: - compound = compounds[1] 
- if 'id' in compound.keys(): - obj = compound['id'] - if 'id' in obj.keys(): - id_obj = obj['id'] - if 'cid' in id_obj.keys(): - pubchem_cid = str(id_obj['cid']) - return pubchem_cid - - - @staticmethod - def get_description_url_from_cid(pubchem_cid): - """ query the description URL from HMDB - Args: - pubchem_cid (str): PubChem CID, e.g. 123689 - Returns: - desc_url (str): the URL of HMDB website, which contains the description of the compound - """ - res_url = None - if not isinstance(pubchem_cid, str): - return res_url - handler = QueryPubChem.HANDLER_MAP['get_description_url'].format(cid=pubchem_cid) - res = QueryPubChem.__access_api(handler) - if res is not None: - if 'InformationList' in res.keys(): - info_list = res['InformationList'] - if 'Information' in info_list.keys(): - infos = info_list['Information'] - for info in infos: - if 'DescriptionSourceName' in info.keys() and 'DescriptionURL' in info.keys(): - if info['DescriptionSourceName'] == "Human Metabolome Database (HMDB)": - return info['DescriptionURL'] - return res_url - - - @staticmethod - def get_description_url(pubchem_sid): - res_url = None - if not isinstance(pubchem_sid, str): - return res_url - pubchem_cid = QueryPubChem.get_pubchem_cid(pubchem_sid) - if pubchem_cid is not None: - res_url = QueryPubChem.get_description_url_from_cid(pubchem_cid) - return res_url - -if __name__ == '__main__': - print(QueryPubChem.get_chembl_ids_for_drug('gne-493')) - print(QueryPubChem.get_pubchem_id_for_chembl_id('CHEMBL521')) - print(QueryPubChem.get_pubchem_id_for_chembl_id('chembl521')) - print(QueryPubChem.get_pubchem_id_for_chembl_id('3400')) - print(QueryPubChem.get_pubmed_id_for_pubchem_id('3672')) - print(QueryPubChem.get_pubmed_id_for_pubchem_id('3500')) - print(QueryPubChem.get_pubmed_id_for_pubchem_id('3400')) - print(QueryPubChem.get_description_url('6921')) - print(QueryPubChem.get_description_url('3500')) - print(QueryPubChem.get_description_url('3400')) - 
print(QueryPubChem.get_description_url(3400)) - print(QueryPubChem.get_description_url('3324')) - diff --git a/code/reasoningtool/kg-construction/QueryUniprot.py b/code/reasoningtool/kg-construction/QueryUniprot.py deleted file mode 100644 index 0b6439e11..000000000 --- a/code/reasoningtool/kg-construction/QueryUniprot.py +++ /dev/null @@ -1,218 +0,0 @@ -""" This module defines the class QueryUniprot which connects to APIs at -http://www.uniprot.org/uploadlists/, querying reactome pathways from uniprot id. - -* map_enzyme_commission_id_to_uniprot_ids(ec_id) - - Description: - map enzyme commission id to UniProt ids - - Args: - ec_id (str): enzyme commission id, e.g., "ec:1.4.1.17" - - Returns: - ids (set): a set of the enzyme commission ids, or empty set if no UniProt id can be obtained or the response - status code is not 200. - -""" - -__author__ = "" -__copyright__ = "" -__credits__ = [] -__license__ = "" -__version__ = "" -__maintainer__ = "" -__email__ = "" -__status__ = "Prototype" - -# import requests -# import requests_cache -from cache_control_helper import CacheControlHelper -import CachedMethods -import sys -import urllib.parse -import xmltodict - - -class QueryUniprot: - API_BASE_URL = "http://www.uniprot.org/uploadlists/" - TIMEOUT_SEC = 120 - HANDLER_MAP = { - 'map_enzyme_commission_id_to_uniprot_ids': 'uniprot/?query=({id})&format=tab&columns=id', - 'get_protein': 'uniprot/{id}.xml' - } - - @staticmethod - @CachedMethods.register - def uniprot_id_to_reactome_pathways(uniprot_id): - """returns a ``set`` of reactome IDs of pathways associated with a given string uniprot ID - - :param uniprot_id: a ``str`` uniprot ID, like ``"P68871"`` - :returns: a ``set`` of string Reactome IDs - """ - - payload = { 'from': 'ACC', - 'to': 'REACTOME_ID', - 'format': 'tab', - 'query': uniprot_id } - contact = "stephen.ramsey@oregonstate.edu" - header = {'User-Agent': 'Python %s' % contact} - - requests = CacheControlHelper() - - try: - url =QueryUniprot.API_BASE_URL - 
res = requests.post(QueryUniprot.API_BASE_URL, data=payload, headers=header) - except requests.exceptions.Timeout: - print(url, file=sys.stderr) - print('Timeout in QueryUniprot for URL: ' + QueryUniprot.API_BASE_URL, file=sys.stderr) - return None - except KeyboardInterrupt: - sys.exit(0) - except BaseException as e: - print(url, file=sys.stderr) - print('%s received in QueryUniprot for URL: %s' % (e, url), file=sys.stderr) - return None - status_code = res.status_code - if status_code != 200: - print(QueryUniprot.API_BASE_URL, file=sys.stderr) - print('Status code ' + str(status_code) + ' for url: ' + QueryUniprot.API_BASE_URL, file=sys.stderr) - return None -# assert 200 == res.status_code - res_set = set() - for line in res.text.splitlines(): - field_str = line.split("\t")[1] - if field_str != "To": - res_set.add(field_str) - return res_set - - @staticmethod - def __access_api(handler): - - api_base_url = 'http://www.uniprot.org' - url = api_base_url + '/' + handler - #print(url) - contact = "stephen.ramsey@oregonstate.edu" - header = {'User-Agent': 'Python %s' % contact} - - requests = CacheControlHelper() - try: - res = requests.get(url, timeout=QueryUniprot.TIMEOUT_SEC, headers=header) - except requests.exceptions.Timeout: - print(url, file=sys.stderr) - print('Timeout in QueryUniprot for URL: ' + url, file=sys.stderr) - return None - except requests.exceptions.ChunkedEncodingError: - print(url, file=sys.stderr) - print('ChunkedEncodingError for URL: ' + url, file=sys.stderr) - return None - except BaseException as e: - print(url, file=sys.stderr) - print('%s received in QueryUniprot for URL: %s' % (e, url), file=sys.stderr) - return None - status_code = res.status_code - if status_code != 200: - print(url, file=sys.stderr) - print('Status code ' + str(status_code) + ' for url: ' + url, file=sys.stderr) - return None - return res.text - - @staticmethod - def map_enzyme_commission_id_to_uniprot_ids(ec_id): - res_set = set() - if not isinstance(ec_id, str): - 
return res_set - ec_id_encoded = urllib.parse.quote_plus(ec_id) - handler = QueryUniprot.HANDLER_MAP['map_enzyme_commission_id_to_uniprot_ids'].format(id=ec_id_encoded) - res = QueryUniprot.__access_api(handler) - if res is not None: - res = res[res.find('\n')+1:] - for line in res.splitlines(): - res_set.add(line) - return res_set - - @staticmethod - def __get_entity(entity_type, entity_id): - if entity_id[:10] == 'UniProtKB:': - entity_id = entity_id[10:] - handler = QueryUniprot.HANDLER_MAP[entity_type].format(id=entity_id) - results = QueryUniprot.__access_api(handler) - entity = None - if results is not None: - obj = xmltodict.parse(results) - if 'uniprot' in obj.keys(): - if 'entry' in obj['uniprot'].keys(): - entity = obj['uniprot']['entry'] - return entity - - @staticmethod - def get_protein_gene_symbol(entity_id): - ret_symbol = "None" - if not isinstance(entity_id, str): - return ret_symbol - entity_obj = QueryUniprot.__get_entity("get_protein", entity_id) - if entity_obj is not None: - if 'gene' in entity_obj.keys(): - if "name" in entity_obj["gene"].keys(): - gene_name_obj = entity_obj["gene"]["name"] - if not type(gene_name_obj) == list: - gene_name_obj = [gene_name_obj] - for name_dict in gene_name_obj: - # print(name_dict) - if "primary" in name_dict.values() and "#text" in name_dict.keys(): - ret_symbol = name_dict["#text"] - return ret_symbol - - @staticmethod - def __get_name(entity_type, entity_id): - entity_obj = QueryUniprot.__get_entity(entity_type, entity_id) - name = "UNKNOWN" - if entity_obj is not None: - if 'protein' in entity_obj.keys(): - if 'recommendedName' in entity_obj['protein'].keys(): - if 'fullName' in entity_obj['protein']['recommendedName'].keys(): - name = entity_obj['protein']['recommendedName']['fullName'] - if isinstance(name, dict): - name = name['#text'] - return name - - @staticmethod - def get_protein_name(protein_id): - if not isinstance(protein_id, str): - return "UNKNOWN" - return 
QueryUniprot.__get_name("get_protein", protein_id) - - @staticmethod - def get_citeable_accession_for_accession(accession_number): - res_acc = None - res_tab = QueryUniprot.__access_api("uniprot/" + accession_number + ".tab") - if res_tab is None: - return res_acc - res_lines = res_tab.splitlines() - if len(res_lines) > 1: - res_acc = res_lines[1].split("\t")[0] - return res_acc - -if __name__ == '__main__': - print(QueryUniprot.get_citeable_accession_for_accession("P35354")) - print(QueryUniprot.get_citeable_accession_for_accession("A8K802")) - print(QueryUniprot.get_citeable_accession_for_accession("Q16876")) - # print(QueryUniprot.uniprot_id_to_reactome_pathways("P68871")) - # print(QueryUniprot.uniprot_id_to_reactome_pathways("Q16621")) - # print(QueryUniprot.uniprot_id_to_reactome_pathways("P09601")) - print(CachedMethods.cache_info()) - print(QueryUniprot.map_enzyme_commission_id_to_uniprot_ids("ec:1.4.1.17")) # small results - print(QueryUniprot.map_enzyme_commission_id_to_uniprot_ids("ec:1.3.1.110")) # empty result - print(QueryUniprot.map_enzyme_commission_id_to_uniprot_ids("ec:1.2.1.22")) # large results - print(QueryUniprot.map_enzyme_commission_id_to_uniprot_ids("ec:4.4.1.xx")) # fake id - print(QueryUniprot.map_enzyme_commission_id_to_uniprot_ids("R-HSA-1912422")) # wrong id - print(QueryUniprot.get_protein_gene_symbol('UniProtKB:P20848')) - print(QueryUniprot.get_protein_gene_symbol("UniProtKB:P01358")) - print(QueryUniprot.get_protein_gene_symbol("UniProtKB:Q96P88")) - print(QueryUniprot.get_protein_name('UniProtKB:P01358')) - print(QueryUniprot.get_protein_name('UniProtKB:P20848')) - print(QueryUniprot.get_protein_name('UniProtKB:Q9Y471')) - print(QueryUniprot.get_protein_name('UniProtKB:O60397')) - print(QueryUniprot.get_protein_name('UniProtKB:Q8IZJ3')) - print(QueryUniprot.get_protein_name('UniProtKB:Q7Z2Y8')) - print(QueryUniprot.get_protein_name('UniProtKB:Q8IWN7')) - print(QueryUniprot.get_protein_name('UniProtKB:Q156A1')) diff --git 
a/code/reasoningtool/kg-construction/SynonymMapper.py b/code/reasoningtool/kg-construction/SynonymMapper.py deleted file mode 100644 index 86ebc7ba5..000000000 --- a/code/reasoningtool/kg-construction/SynonymMapper.py +++ /dev/null @@ -1,172 +0,0 @@ -import sys -import os - -from NormGoogleDistance import NormGoogleDistance -from QueryMyGene import QueryMyGene -import mygene -import requests -from QueryMyChem import QueryMyChem -import requests_cache -import pandas -#import _mysql_exceptions - - -class SynonymMapper(): - - def __init__(self): - self.biothings_url = "http://c.biothings.io/v1/query?q=" - self.mygene_obj = mygene.MyGeneInfo() - self.qmg = QueryMyGene() - - def prot_to_gene(self, curie_id): - """ - This takes a uniprot curie id and converts it into a few different gene ids - """ - if len(curie_id.split(':'))>1: - uniprot_id = curie_id.split(':')[1] - else: - return None - entrez_ids = self.qmg.convert_uniprot_id_to_entrez_gene_ID(uniprot_id) - if entrez_ids is not None: - entrez_ids = set(entrez_ids) - else: - entrez_ids = set() - hgnc_ids = set() - mim_ids = set() - vega_ids = set() - ensembl_ids = set() - synonyms = [] - - symbols = self.qmg.convert_uniprot_id_to_gene_symbol(uniprot_id) - for symbol in symbols: - synonyms += ['HGNC.Symbol:' + symbol] - - for gene_id in entrez_ids: - synonyms += ['NCBIGene:' + str(gene_id)] - try: - res = self.mygene_obj.getgene(int(gene_id), fields = 'HGNC,MIM,Vega,ensembl', verbose = False) - except requests.exceptions.HTTPError: - print('HTTP error for querying uniprot to gene symbol mygene: ' + uniprot_id, file=sys.stderr) - res = None - if res is not None: - hgnc_res = res.get('HGNC', None) - mim_res = res.get('MIM', None) - vega_res = res.get('Vega', None) - ensembl_res = res.get('ensembl', None) - else: - hgnc_res = None - mim_res = None - vega_res = None - ensembl_res = None - if hgnc_res is not None: - hgnc_ids |= set([hgnc_res]) - if mim_res is not None: - mim_ids |= set([mim_res]) - if vega_res is not None: 
- vega_ids |= set([vega_res]) - if ensembl_res is not None: - if type(ensembl_res) == list: - for ens_res in ensembl_res: - ensembl_gene_res = ens_res.get('gene', None) - if ensembl_gene_res is not None: - ensembl_ids |= set([ensembl_gene_res]) - else: - ensembl_gene_res = ensembl_res.get('gene', None) - if ensembl_gene_res is not None: - ensembl_ids |= set([ensembl_gene_res]) - - for hgnc_id in hgnc_ids: - synonyms += ['HGNC:' + str(hgnc_id)] - for mim_id in mim_ids: - synonyms += ['OMIM:' + str(mim_id)] - for vega_id in vega_ids: - synonyms += ['Vega:' + str(vega_id)] - for ensembl_id in ensembl_ids: - synonyms += ['ensembl:' + str(ensembl_id)] - - if len(synonyms)>0: - return synonyms - else: - return None - - def get_all_from_oxo(self, curie_id, map_to = None): - """ - this takes a curie id and gets all the mappings that oxo has for the given id - - :param curie_id: The string for the curie id to submit to OXO (e.g. 'HP:0001947') - :param map_to: A string containing the prefix for the resulting ids. If set to None it will return all mappings. 
(default is none) - - :return: A list of strings containing the found mapped ids or None if none where found - """ - if map_to is None: - map_to = '' - if type(curie_id) != str: - curie_id = str(curie_id) - if curie_id.startswith('REACT:'): - curie_id = curie_id.replace('REACT', 'Reactome') - prefix = curie_id.split(':')[0] - res = NormGoogleDistance.query_oxo(curie_id) - synonym_ids=None - if res is not None: - res = res.json() - synonym_ids = set() - n_res = res['page']['totalElements'] - if int(n_res) > 0: - mappings = res['_embedded']['mappings'] - for mapping in mappings: - if type(map_to) == list: - for elm in map_to: - if mapping['fromTerm']['curie'].startswith(prefix): - if mapping['toTerm']['curie'].startswith(elm): - synonym_ids |= set([mapping['toTerm']['curie']]) - elif mapping['toTerm']['curie'].startswith(prefix): - if mapping['fromTerm']['curie'].startswith(elm): - synonym_ids |= set([mapping['fromTerm']['curie']]) - else: - if mapping['fromTerm']['curie'].startswith(prefix): - if mapping['toTerm']['curie'].startswith(map_to): - synonym_ids |= set([mapping['toTerm']['curie']]) - elif mapping['toTerm']['curie'].startswith(prefix): - if mapping['fromTerm']['curie'].startswith(map_to): - synonym_ids |= set([mapping['fromTerm']['curie']]) - if len(synonym_ids) == 0: - synonym_ids = None - else: - synonym_ids = list(synonym_ids) - return synonym_ids - - - def chembl_to_chebi(self, chemical_substance_id): - """ - This takes a chembl curie id and return a chebi curie id - """ - if chemical_substance_id[:7] == "ChEMBL:": - chemical_substance_id = chemical_substance_id.replace("ChEMBL:", "CHEMBL") - if chemical_substance_id.startswith('CHEMBL:CHEMBL'): - chemical_substance_id = chemical_substance_id.replace("CHEMBL:", "") - handler = 'chem/' + chemical_substance_id + '?fields=chebi.chebi_id' - - url = QueryMyChem.API_BASE_URL + '/' + handler - - try: - res = requests.get(url, timeout=QueryMyChem.TIMEOUT_SEC) - except requests.exceptions.Timeout: - #print(url, 
file=sys.stderr) - #print('Timeout in QueryMyChem for URL: ' + url, file=sys.stderr) - return None - if res is None: - return None - status_code = res.status_code - if status_code != 200: - #print(url, file=sys.stderr) - #print('Status code ' + str(status_code) + ' for url: ' + url, file=sys.stderr) - return None - id_json = res.json() - if 'chebi' in id_json.keys(): - return id_json['chebi']['chebi_id'] - else: - return None - - - - diff --git a/code/reasoningtool/kg-construction/chemblMap.tsv b/code/reasoningtool/kg-construction/chemblMap.tsv deleted file mode 100644 index 1158110c7..000000000 --- a/code/reasoningtool/kg-construction/chemblMap.tsv +++ /dev/nulldiff --git a/code/reasoningtool/kg-construction/request_cache_helper.py b/code/reasoningtool/kg-construction/request_cache_helper.py deleted file mode 100644 index 2011bb7d6..000000000 --- a/code/reasoningtool/kg-construction/request_cache_helper.py +++ /dev/null @@ -1,65 +0,0 @@ -import requests -import requests_cache -import hashlib -import time -import re, os - -_DEFAULT_HEADERS = requests.utils.default_headers() - -#requests_cache.install_cache("orangeboard") -# specifiy the path of orangeboard database -tmppath = re.compile(".*/RTX/") -dbpath = tmppath.search(os.path.realpath(__file__)).group(0) + 'data/orangeboard' -requests_cache.install_cache(dbpath) - -def get_timestamp(url): - """ - get the timestamp of an HTTP get request - :param url: the URL of the request - :return the timestamp of the request, of None if the request is not in the cache - """ - def _to_bytes(s, encoding='utf-8'): - return bytes(s, encoding) - - def create_key(request): - url, body = request.url, request.body - key = hashlib.sha256() - key.update(_to_bytes(request.method.upper())) - key.update(_to_bytes(url)) - if request.body: - key.update(_to_bytes(body)) - return key.hexdigest() - - def url_to_key(url): - session = requests.Session() - return create_key(session.prepare_request(requests.Request('GET', url))) - - # get the 
cache from request_cache - results = requests_cache.get_cache() - # create the key according to the url - key_url = url_to_key(url) - # results.responses is a dictionary and follows the following format: - # { 'key': (requests_cache.backends objects, timestamp), ..., } - # for example: '4c28e3e4a61e325e520d9c02e0caee99e30c00951a223e67': - # (, - # datetime.datetime(2018, 10, 16, 0, 19, 8, 130204)), - if key_url in results.responses: - back_obj, timestamp = results.responses[key_url] - return timestamp - return None - - -if __name__ == '__main__': - - url = 'http://cohd.io/api/association/obsExpRatio?dataset_id=1&concept_id_1=192855&domain=Procedure' - url1 = 'http://cohd.io/api/association/obsExpRatio?dataset_id=1&concept_id_1=192853&domain=Procedure' - url2 = 'http://cohd.io/api/association/obsExpRatio?dataset_id=1&concept_id_1=192854&domain=Procedure' - - res = requests.get(url) - res = requests.get(url1) - - t = time.time() - print(get_timestamp(url)) - print(get_timestamp(url1)) - print(get_timestamp(url2)) - print("Time used: ", time.time() - t)