From de5d0a57fc2bd81fff86eaf51c1bc9a5a949ca50 Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 27 Jul 2023 15:18:49 -0700 Subject: [PATCH] #321 adding some documentation to query kegg and clearing path between query kegg and kegg conversion --- kegg_json_to_kg_jsonl.py | 25 ++++++++++++++----------- query_kegg.py | 4 ++++ 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/kegg_json_to_kg_jsonl.py b/kegg_json_to_kg_jsonl.py index 3277c6f7..481858b3 100755 --- a/kegg_json_to_kg_jsonl.py +++ b/kegg_json_to_kg_jsonl.py @@ -463,13 +463,17 @@ def process_enzyme(enzyme_dict, kegg_id, nodes_output, edges_output, update_date edges_output.write(format_kegg_edge(node_id, pathway, update_date)) -def make_kg2_graph(kegg, nodes_output, edges_output, update_date): - version_number = kegg['info']['version'] - version_date = kegg['info']['update_date'] - for kegg_id in kegg: +def make_kg2_graph(input_kegg, nodes_output, edges_output, update_date): + version_number = "TEMP" + version_date = "TEMP" + for kegg_input_dict in input_kegg: + for single_item in kegg_input_dict: + kegg_id = single_item if kegg_id == 'info': + version_number = kegg_input_dict[kegg_id]['version'] + version_date = kegg_input_dict[kegg_id]['update_date'] continue - kegg_dict = kegg[kegg_id] + kegg_dict = kegg_input_dict[kegg_id] if KEGG_COMPOUND_PREFIX.match(kegg_id) is not None: process_compound(kegg_dict, kegg_id, nodes_output, edges_output, update_date) @@ -505,17 +509,16 @@ def make_kg2_graph(kegg, nodes_output, edges_output, update_date): output_edges_file_name = args.outputEdgesFile test_mode = args.test + input_jsonlines_info = kg2_util.start_read_jsonlines(input_file_name) + input_kegg = input_jsonlines_info[0] + nodes_info, edges_info = kg2_util.create_kg2_jsonlines(test_mode) nodes_output = nodes_info[0] edges_output = edges_info[0] - kegg = dict() - with open(input_file_name, 'r') as kegg_file: - update_date = kg2_util.convert_date(os.path.getmtime(input_file_name)) - kegg = json.load(kegg_file) - - make_kg2_graph(kegg, nodes_output, edges_output, update_date) + make_kg2_graph(input_kegg, nodes_output, edges_output, update_date) + kg2_util.end_read_jsonlines(input_jsonlines_info) kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name) print("Finish time: ", date()) diff --git a/query_kegg.py b/query_kegg.py index fd7fec8c..bd9552bc 100755 --- a/query_kegg.py +++ b/query_kegg.py @@ -105,10 +105,12 @@ def create_query_lists(kegg_id_dict, num_threads): def create_threads(num_threads, output_writer): kegg_id_dict, info_dict = preliminary_queries() + output_writer.write({"info": info_dict}) query_lists = create_query_lists(kegg_id_dict, num_threads) threads = list() print("Number of queriers: ", len(query_lists)) + print("Starting at", kg2_util.date()) for kegg_querier, query_dict in query_lists: print(kegg_querier.name + ": " + str(len(query_dict))) thread = threading.Thread(target=kegg_querier.run_set_of_queries, args=(query_dict,)) @@ -169,6 +171,8 @@ def run_set_of_queries(self, kegg_id_dict): for kegg_id in kegg_id_dict: previous_line_starter = '' + + # If we have a connection issue (which will cause a parsing error), spin until it works, but put a note in the log while True: try: results = send_query(get_base_query + kegg_id)