Skip to content

Commit

Permalink
Convert SSSOM TSV => JSON => TSV and confirm the MappingSetDataFrame …
Browse files Browse the repository at this point in the history
…remains consistent.
  • Loading branch information
hrshdhgd committed Sep 26, 2023
1 parent 2eba644 commit 831a58e
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 9 deletions.
43 changes: 36 additions & 7 deletions src/sssom/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,21 +212,28 @@ def parse_sssom_table(
logging.info(f"Externally provided metadata {k}:{v} is added to metadata set.")
sssom_metadata[k] = v
meta = sssom_metadata

if "curie_map" in sssom_metadata:
if CURIE_MAP in sssom_metadata:
if prefix_map:
# Convert sssom_metadata[CURIE_MAP] keys to lowercase for case-insensitive comparison
curie_map_lower = {k.lower(): k for k in sssom_metadata[CURIE_MAP].keys()}

for k, v in prefix_map.items():
if k in sssom_metadata[CURIE_MAP]:
if sssom_metadata[CURIE_MAP][k] != v:
k_lower = k.lower()

if k_lower in curie_map_lower:
original_key = curie_map_lower[k_lower]
if sssom_metadata[CURIE_MAP][original_key] != v:
logging.warning(
f"SSSOM prefix map {k} ({sssom_metadata[CURIE_MAP][k]}) "
f"conflicts with provided ({prefix_map[k]})."
f"SSSOM prefix map {original_key} ({sssom_metadata[CURIE_MAP][original_key]}) "
f"conflicts with provided ({v})."
)
sssom_metadata[CURIE_MAP][original_key] = v
else:
logging.info(
f"Externally provided metadata {k}:{v} is added to metadata set."
)
sssom_metadata[CURIE_MAP][k] = v

prefix_map = sssom_metadata[CURIE_MAP]

meta_all = _get_prefix_map_and_metadata(prefix_map=prefix_map, meta=meta)
Expand Down Expand Up @@ -264,10 +271,32 @@ def parse_sssom_json(
) -> MappingSetDataFrame:
"""Parse a TSV to a :class:`MappingSetDocument` to a :class`MappingSetDataFrame`."""
raise_for_bad_path(file_path)
metadata = _get_prefix_map_and_metadata(prefix_map=prefix_map, meta=meta)

with open(file_path) as json_file:
jsondoc = json.load(json_file)

# Get prefix map from jsondoc and update metadata.
# This takes priority over default prefix_map in case of a tie.
jsondoc_prefix_map = jsondoc["@context"]

# Convert keys in both maps to lower case for comparison
if prefix_map:
lowercase_prefix_map = {k.lower(): v for k, v in prefix_map.items()}

# Iterate over jsondoc_prefix_map
for key, value in jsondoc_prefix_map.items():
# If lowercase key exists in lowercase_prefix_map, update the value and key
if key.lower() in lowercase_prefix_map:
# Remove the old key-value pair
if key in prefix_map:
del prefix_map[key]
elif key.lower() in prefix_map:
del prefix_map[key.lower()]
# Add the new key-value pair
prefix_map[key] = value

metadata = _get_prefix_map_and_metadata(prefix_map=prefix_map, meta=meta)

msdf = from_sssom_json(jsondoc=jsondoc, prefix_map=metadata.prefix_map, meta=metadata.metadata)
# df: pd.DataFrame = msdf.df
# if mapping_predicates and not df.empty():
Expand Down
8 changes: 7 additions & 1 deletion src/sssom/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,13 @@ def clean_prefix_map(self, strict: bool = True) -> None:
if self.metadata:
prefixes_in_table.update(get_prefixes_used_in_metadata(self.metadata))

missing_prefixes = prefixes_in_table - self.converter.get_prefixes()
# Convert prefixes_in_table to lowercase
prefixes_in_table_lower = {prefix.lower() for prefix in prefixes_in_table}
# Convert self.converter.get_prefixes() to lowercase
converter_prefixes_lower = {prefix.lower() for prefix in self.converter.get_prefixes()}
missing_prefixes = prefixes_in_table_lower - converter_prefixes_lower
# missing_prefixes = prefixes_in_table - self.converter.get_prefixes()

if missing_prefixes and strict:
raise ValueError(
f"{missing_prefixes} are used in the SSSOM mapping set but it does not exist in the prefix map"
Expand Down
8 changes: 8 additions & 0 deletions tests/data/sample1.sssom.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#curie_map:
# FBbt: "http://purl.obolibrary.org/obo/FBbt_"
# ORCID: "https://orcid.org/"
# UBERON: "http://purl.obolibrary.org/obo/UBERON_"
#creator_id:
# - "ORCID:0000-0002-6095-8718"
subject_id subject_label predicate_id object_id mapping_justification
FBbt:00000001 organism semapv:crossSpeciesExactMatch UBERON:0000468 semapv:ManualMappingCuration
20 changes: 19 additions & 1 deletion tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,12 @@
from_sssom_dataframe,
from_sssom_json,
from_sssom_rdf,
parse_sssom_json,
parse_sssom_table,
)
from sssom.typehints import Metadata
from sssom.util import PREFIX_MAP_KEY, sort_df_rows_columns
from sssom.writers import write_table
from sssom.writers import to_json, write_json, write_table
from tests.test_data import data_dir as test_data_dir
from tests.test_data import test_out_dir

Expand Down Expand Up @@ -245,3 +246,20 @@ def test_parse_obographs_merged(self):
)
msdf = parse_sssom_table(outfile)
self.assertTrue(custom_curie_map.items() <= msdf.prefix_map.items())

def test_tsv_to_json_and_back(self):
"""Test converting SSSOM TSV => JSON => SSSOM TSV such that it is reproducible."""
sample_tsv = f"{test_data_dir}/sample1.sssom.tsv"
json_outfile = f"{test_out_dir}/sample1.json"
msdf1 = parse_sssom_table(sample_tsv)
msdf1.clean_prefix_map()
json_doc = to_json(msdf1)

self.assertEqual(msdf1.prefix_map, json_doc["@context"])

with open(json_outfile, "w") as file:
write_json(msdf1, file)

msdf2 = parse_sssom_json(json_outfile)
msdf2.clean_prefix_map()
self.assertEqual(msdf1.prefix_map, msdf2.prefix_map)

0 comments on commit 831a58e

Please sign in to comment.