Skip to content

Commit

Permalink
#321 completely reopen the file every time
Browse files Browse the repository at this point in the history
  • Loading branch information
ecwood committed Jul 20, 2023
1 parent dfc2ac3 commit 9ea83b5
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 30 deletions.
6 changes: 6 additions & 0 deletions kg2_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,12 @@ def close_kg2_jsonlines(nodes_info: tuple, edges_info: tuple,
temp_output_edges_file.close()


def read_json_lines(file_name: str):
    """Return an iterator over the records of a JSON Lines file.

    Every call opens ``file_name`` afresh, so each caller gets an
    independent pass over the file starting from its first line.

    :param file_name: path of the JSON Lines file to read
    :return: an iterator producing the file's records one at a time; each
        record is requested from ``jsonlines`` as a ``dict`` (``type=dict``)
    """
    # Open eagerly (not inside the generator) so that a missing or unreadable
    # file still raises at call time rather than on first iteration.
    input_file = open(file_name, 'r')

    def _iter_records():
        # The `with` releases the handle as soon as iteration finishes, fails,
        # or the partially consumed iterator is closed/garbage-collected.
        # Without it every call would leave an open file behind.
        with input_file:
            yield from jsonlines.Reader(input_file).iter(type=dict)

    return _iter_records()


def get_file_last_modified_timestamp(file_name: str) -> time.struct_time:
    """Return the last-modification time of ``file_name`` as a UTC struct_time.

    :param file_name: path of the file to inspect
    :return: the file's modification time (``os.path.getmtime``) converted
        with ``time.gmtime``, i.e. broken-down time in UTC
    :raises OSError: propagated from ``os.path.getmtime`` when the file does
        not exist or is inaccessible
    """
    modified_epoch_seconds = os.path.getmtime(file_name)
    return time.gmtime(modified_epoch_seconds)

Expand Down
48 changes: 18 additions & 30 deletions report_stats_on_json_kg.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,12 +205,6 @@ def count_orphan_nodes(nodes: list, edges: list):
input_nodes_file_name = args.inputNodesFile
input_edges_file_name = args.inputEdgesFile

input_nodes_file = open(input_nodes_file_name, 'r')
input_edges_file = open(input_edges_file_name, 'r')

nodes = jsonlines.Reader(input_nodes_file)
edges = jsonlines.Reader(input_edges_file)

build_info = dict()

for n in nodes: # search for build info node starting at end
Expand All @@ -222,42 +216,36 @@ def count_orphan_nodes(nodes: list, edges: list):
print("WARNING: 'build' property is missing from the input JSON.", file=sys.stderr)

number_of_nodes = 0
for node in jsonlines.Reader(input_nodes_file):
for node in kg2_util.read_json_lines(input_nodes_file):
number_of_nodes += 1

number_of_edges = 0
for edge in edges:
for edge in kg2_util.read_json_lines(input_edges_file):
number_of_edges += 1

stats = {'_number_of_nodes': number_of_nodes, # underscore is to make sure it sorts to the top of the report
'_number_of_edges': number_of_edges, # underscore is to make sure it sorts to the top of the report
'_report_datetime': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
'_build_version': build_info.get('version', ""),
'_build_time': build_info.get('timestamp_utc', ""),
'number_of_nodes_by_curie_prefix': dict(count_nodes_by_curie_prefix(jsonlines.Reader(input_nodes_file).iter(type=dict))),
'number_of_nodes_without_category__by_curie_prefix': dict(count_nodes_by_curie_prefix_given_no_category(jsonlines.Reader(input_nodes_file).iter(type=dict))),
'number_of_nodes_by_category_label': dict(count_nodes_by_category(jsonlines.Reader(input_nodes_file).iter(type=dict))),
'number_of_nodes_by_source': dict(count_nodes_by_source(jsonlines.Reader(input_nodes_file).iter(type=dict))),
'number_of_edges_by_predicate_curie': dict(count_edges_by_predicate_curie(jsonlines.Reader(input_edges_file).iter(type=dict))),
'number_of_edges_by_predicate_type': dict(count_edges_by_predicate_type(jsonlines.Reader(input_edges_file).iter(type=dict))),
'number_of_edges_by_predicate_curie_prefixes': dict(count_edges_by_predicate_curie_prefix(jsonlines.Reader(input_edges_file).iter(type=dict))),
'number_of_predicates_by_predicate_curie_prefixes': dict(count_predicates_by_predicate_curie_prefix(jsonlines.Reader(input_edges_file).iter(type=dict))),
'number_of_edges_by_source': dict(count_edges_by_source(jsonlines.Reader(input_edges_file).iter(type=dict))),
'types_of_pairs_of_curies_for_xrefs': dict(count_types_of_pairs_of_curies_for_xrefs(jsonlines.Reader(input_edges_file).iter(type=dict))),
'types_of_pairs_of_curies_for_equivs': dict(count_types_of_pairs_of_curies_for_equivs(jsonlines.Reader(input_edges_file).iter(type=dict))),
'number_of_nodes_by_source_and_category': dict(count_number_of_nodes_by_source_and_category(jsonlines.Reader(input_nodes_file).iter(type=dict))),
'sources': get_sources(jsonlines.Reader(input_nodes_file).iter(type=dict)),
'number_of_deprecated_nodes': get_deprecated_nodes(jsonlines.Reader(input_nodes_file).iter(type=dict)),
'number_of_excluded_edges': get_excluded_edges(jsonlines.Reader(input_edges_file).iter(type=dict)),
'number_of_orphan_nodes': count_orphan_nodes(jsonlines.Reader(input_nodes_file).iter(type=dict), jsonlines.Reader(input_edges_file).iter(type=dict))}
'number_of_nodes_by_curie_prefix': dict(count_nodes_by_curie_prefix(kg2_util.read_json_lines(input_nodes_file))),
'number_of_nodes_without_category__by_curie_prefix': dict(count_nodes_by_curie_prefix_given_no_category(kg2_util.read_json_lines(input_nodes_file))),
'number_of_nodes_by_category_label': dict(count_nodes_by_category(kg2_util.read_json_lines(input_nodes_file))),
'number_of_nodes_by_source': dict(count_nodes_by_source(kg2_util.read_json_lines(input_nodes_file))),
'number_of_edges_by_predicate_curie': dict(count_edges_by_predicate_curie(kg2_util.read_json_lines(input_edges_file))),
'number_of_edges_by_predicate_type': dict(count_edges_by_predicate_type(kg2_util.read_json_lines(input_edges_file))),
'number_of_edges_by_predicate_curie_prefixes': dict(count_edges_by_predicate_curie_prefix(kg2_util.read_json_lines(input_edges_file))),
'number_of_predicates_by_predicate_curie_prefixes': dict(count_predicates_by_predicate_curie_prefix(kg2_util.read_json_lines(input_edges_file))),
'number_of_edges_by_source': dict(count_edges_by_source(kg2_util.read_json_lines(input_edges_file))),
'types_of_pairs_of_curies_for_xrefs': dict(count_types_of_pairs_of_curies_for_xrefs(kg2_util.read_json_lines(input_edges_file))),
'types_of_pairs_of_curies_for_equivs': dict(count_types_of_pairs_of_curies_for_equivs(kg2_util.read_json_lines(input_edges_file))),
'number_of_nodes_by_source_and_category': dict(count_number_of_nodes_by_source_and_category(kg2_util.read_json_lines(input_nodes_file))),
'sources': get_sources(kg2_util.read_json_lines(input_nodes_file)),
'number_of_deprecated_nodes': get_deprecated_nodes(kg2_util.read_json_lines(input_nodes_file)),
'number_of_excluded_edges': get_excluded_edges(kg2_util.read_json_lines(input_edges_file)),
'number_of_orphan_nodes': count_orphan_nodes(kg2_util.read_json_lines(input_nodes_file), kg2_util.read_json_lines(input_edges_file))}

temp_output_file = tempfile.mkstemp(prefix='kg2-')[1]
with open(temp_output_file, 'w') as outfile:
json.dump(stats, outfile, indent=4)
shutil.move(temp_output_file, args.outputFile)

nodes.close()
edges.close()

input_nodes_file.close()
input_edges_file.close()

0 comments on commit 9ea83b5

Please sign in to comment.