Skip to content

Commit

Permalink
Alternative symbol" synonyms & "Formerly
Browse files Browse the repository at this point in the history
- In previous commits, stated that this would also include 'included' symbols, but for now we have decided not to proceed with that.
- Update: Now adding synonyms for alternative symbols, w/ type of mondo#abbreviation.
- Update: Now stripping the text ', INCLUDED' from symbols. Previously was only doing for titles.
- Update: For included & alternative titles/symbols ending in ', FORMERLY', these are being added as relatedSynonyms, and also being marked as owl:deprecated.
- Misc updates: Updated some comments and codestyle.
  • Loading branch information
joeflack4 committed Sep 15, 2024
1 parent 48fc895 commit 9ef0d72
Show file tree
Hide file tree
Showing 2 changed files with 127 additions and 94 deletions.
128 changes: 72 additions & 56 deletions omim2obo/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
Resources
- https://monarch-initiative.github.io/monarch-ingest/Sources/OMIM/
FYIs
"Included Title(s)" in mimTitles.txt is the same as the "Other entities represented in this entry" section in omim.org
entry pages.
Steps
- Loads prefixes
- Parses mimTitles.txt
Expand Down Expand Up @@ -52,7 +56,8 @@
from rdflib.term import Identifier

from omim2obo.namespaces import *
from omim2obo.parsers.omim_entry_parser import parse_alt_and_included_titles, get_pubs, \
from omim2obo.parsers.omim_entry_parser import clean_alt_and_included_titles, separate_former_titles_and_symbols, \
parse_title_symbol_pairs, get_pubs, \
get_mapped_ids, LabelCleaner
from omim2obo.config import ROOT_DIR, GLOBAL_TERMS
from omim2obo.parsers.omim_txt_parser import *
Expand All @@ -79,6 +84,30 @@ def get_curie_maps():
return maps


def add_axiom_annotations(
graph: Graph, source: URIRef, prop: URIRef, target: Union[Literal, str, URIRef],
anno_pred_vals: List[Tuple[URIRef, Union[Literal, str, URIRef]]]
):
"""Add an axion annotation to the graph."""
axiom = BNode()
graph.add((axiom, RDF.type, OWL.Axiom))
graph.add((axiom, OWL.annotatedSource, source))
graph.add((axiom, OWL.annotatedProperty, prop))
graph.add((axiom, OWL.annotatedTarget, target))
for pred, val in anno_pred_vals:
graph.add((axiom, pred, val))


def add_triple_and_optional_annotations(
graph: Graph, source: URIRef, prop: URIRef, target: Union[Literal, str, URIRef],
anno_pred_vals: List[Tuple[URIRef, Union[Literal, str, URIRef]]] = None
):
"""Add a triple and optional annotations to the graph."""
graph.add((source, prop, target))
if anno_pred_vals:
add_axiom_annotations(graph, source, prop, target, anno_pred_vals)


# Classes
class DeterministicBNode(BNode):
"""Overrides BNode to create a deterministic ID"""
Expand Down Expand Up @@ -164,29 +193,28 @@ def omim2obo(use_cache: bool = False):
continue

# - Non-deprecated
# Parse titles
# Parse titles & symbols
omim_type, pref_titles_str, alt_titles_str, inc_titles_str = omim_type_and_titles[omim_id]
alt_titles: List[str] = []
alt_symbols: List[str] = []
alt_title_endswith_included = False
former_alt_titles: List[str] = []
former_alt_symbols: List[str] = []
included_titles: List[str] = []
included_symbols: List[str] = []
included_title_endswith_included = False

pref_titles: List[str] = [x.strip() for x in pref_titles_str.split(';')]
pref_title: str = pref_titles[0]
pref_symbols: List[str] = pref_titles[1:]
# TODO: separate symbols from titles (2x)
# - do this in the func itself
# TODO: Refactor this redundant code block?
# TODO: finally: I think parse_alt_and_included_labels() might be problematic. It returns this bool if case in
# any of the titles, but doesn't say which one
if alt_titles_str:
alt_titles, alt_symbols, alt_title_endswith_included = \
parse_alt_and_included_titles(alt_titles_str)
alt_titles, alt_symbols = parse_title_symbol_pairs(alt_titles_str)
alt_titles, alt_symbols, former_alt_titles, former_alt_symbols = \
separate_former_titles_and_symbols(alt_titles, alt_symbols)
alt_titles, alt_symbols = clean_alt_and_included_titles(alt_titles, alt_symbols)
former_alt_titles, former_alt_symbols = clean_alt_and_included_titles(former_alt_titles, former_alt_symbols)
if inc_titles_str:
included_titles, included_symbols, included_title_endswith_included = \
parse_alt_and_included_titles(inc_titles_str)
included_titles, included_symbols = parse_title_symbol_pairs(inc_titles_str)
included_titles, included_symbols = clean_alt_and_included_titles(included_titles, included_symbols)
included_is_included = included_titles or included_symbols # redundant. can't be included symbol w/out title

# Special cases depending on OMIM term type
is_gene = omim_type == OmimType.GENE or omim_type == OmimType.HAS_AFFECTED_FEATURE
Expand All @@ -206,7 +234,7 @@ def omim2obo(use_cache: bool = False):
gene_label_err = 'Warning: Only 1 symbol picked for label for gene term, but there were 2 to choose' \
f'from. Unsure which is best. Picking the first.\nhttps://omim.org/entry/{omim_id} - {pref_symbols}'
if len(pref_symbols) > 1:
LOG.warning(gene_label_err) # todo: decide the best way to handle these situations
LOG.warning(gene_label_err) # todo: rare (n=1?), but decide the best way to handle these situations
graph.add((omim_uri, RDFS.label, Literal(pref_symbols[0])))
else:
graph.add((omim_uri, RDFS.label, Literal(label_cleaner.clean(pref_title))))
Expand All @@ -217,50 +245,38 @@ def omim2obo(use_cache: bool = False):
pref_abbrev: Union[str, None] = None if not pref_symbols else pref_symbols[0]

# Add synonyms
# - exact titles
graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(label_cleaner.clean(pref_title, pref_abbrev))))
for alt_title in alt_titles:
graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(label_cleaner.clean(alt_title, pref_abbrev))))
# TODO: add abbrevs for all types. this good now? just check, then remove theis temp code
i = 0
for abbrevs in [pref_symbols, alt_symbols, included_symbols]:
i += 1
if i == 2 and alt_symbols:
print() # TODO: make sure at least one case
if i == 3 and included_symbols:
print() # TODO: make sure at least one case
for title in alt_titles:
graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(label_cleaner.clean(title, pref_abbrev))))
# - related titles
for title in former_alt_titles:
graph.add((omim_uri, oboInOwl.hasRelatedSynonym, Literal(label_cleaner.clean(title, pref_abbrev))))
# - exact abbreviations
# todo #1: Consider included_symbols for 'exact' list https://github.com/monarch-initiative/omim/issues/140
for abbrevs in [pref_symbols, alt_symbols]:
for abbreviation in abbrevs:
graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(abbreviation)))
# Reify on abbreviations. See: https://github.com/monarch-initiative/omim/issues/2
axiom = BNode()
graph.add((axiom, RDF.type, OWL.Axiom))
graph.add((axiom, OWL.annotatedSource, omim_uri))
graph.add((axiom, OWL.annotatedProperty, oboInOwl.hasExactSynonym))
graph.add((axiom, OWL.annotatedTarget, Literal(abbreviation)))
graph.add((axiom, OBOINOWL.hasSynonymType, MONDONS.abbreviation))

# Add 'included' entry properties
included_detected_comment = "This term has one or more labels that end with ', INCLUDED'."
if alt_title_endswith_included or included_title_endswith_included:
graph.add((omim_uri, RDFS['comment'], Literal(included_detected_comment)))
# TODO: are these correct? Do all such labels in inc_labels and alt_labels end with 'INCLUDED'? Or just 1 of
# them, given this boolean? there probably is only 1 such title, and otherwise are symbols. so need to refactor
# should not be iterating here. symbols should be added elsewhere
# - If #1 and #2 never happen, then the _parse*() func shouldn't return this bool, or we shouldn't use it.
# And if we don't use it, then if we only set titles = parse*(), does the boolean get tacked on there? if
# not, then remove its assignment.
for alt_title in alt_titles:
# TODO: #1 Check: do alt titles really ever end with text 'included'? if not, remove this whole variable
if alt_title_endswith_included:
graph.add((omim_uri, URIRef(INCLUDED_URI), Literal(label_cleaner.clean(alt_title, pref_abbrev))))
# TODO: Don't we want to add synonym otherwise?
# TODO: Ref issue here if exists, else make, and then convert to lowercase todo
else:
print()
# graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(label_cleaner.clean(alt_title, pref_abbrev))))
for included_title in included_titles:
if not included_title_endswith_included: # #2 TODO: this shouldn't happen. check
print()
graph.add((omim_uri, URIRef(INCLUDED_URI), Literal(label_cleaner.clean(included_title, pref_abbrev))))
add_triple_and_optional_annotations(graph, omim_uri, oboInOwl.hasExactSynonym, Literal(abbreviation),
[(OBOINOWL.hasSynonymType, MONDONS.abbreviation)])
# - related, deprecated 'former' titles
for title in former_alt_titles:
clean_title = Literal(label_cleaner.clean(title, pref_abbrev))
add_triple_and_optional_annotations(graph, omim_uri, oboInOwl.hasRelatedSynonym, clean_title,
[(OWL.deprecated, Literal(True))])
# - related, deprecated 'former' abbreviations
for abbreviation in former_alt_symbols:
add_triple_and_optional_annotations(graph, omim_uri, oboInOwl.hasRelatedSynonym, abbreviation,
[(OWL.deprecated, Literal(True)), (OBOINOWL.hasSynonymType, MONDONS.abbreviation)])

# 'Included' entries
included_comment = "This term has one or more labels that end with ', INCLUDED'."
if included_is_included:
graph.add((omim_uri, RDFS['comment'], Literal(included_comment)))
for title in included_titles:
graph.add((omim_uri, URIRef(INCLUDED_URI), Literal(label_cleaner.clean(title, pref_abbrev))))
# todo #1: Consider adding included_symbols https://github.com/monarch-initiative/omim/issues/140
# for symbol in included_symbols:
# graph.add((omim_uri, URIRef(INCLUDED_URI), Literal(symbol)))

# Gene ID
# Why is 'skos:exactMatch' appropriate for disease::gene relationships? - joeflack4 2022/06/06
Expand Down
93 changes: 55 additions & 38 deletions omim2obo/parsers/omim_entry_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,10 @@ def transform_entry(entry) -> Graph:
omim_uri = URIRef(OMIM[omim_num])
other_labels = []
if 'alternativeTitles' in titles:
cleaned, label_endswith_included = parse_alt_and_included_titles(titles['alternativeTitles'])
cleaned, label_endswith_included = parse_title_symbol_pairs(titles['alternativeTitles'])
other_labels += cleaned
if 'includedTitles' in titles:
cleaned, label_endswith_included = parse_alt_and_included_titles(titles['includedTitles'])
cleaned, label_endswith_included = parse_title_symbol_pairs(titles['includedTitles'])
other_labels += cleaned

graph.add((omim_uri, RDF.type, OWL.Class))
Expand Down Expand Up @@ -165,25 +165,26 @@ def _detect_abbreviations(
return replacements


# todo: rename? It's doing more than cleaning; it's mutating
# todo: This step should no longer be necessary as it is now done beforehand: "remove the abbreviation suffixes"
# todo: explicit_abbrev: Change to List[str]. See: https://github.com/monarch-initiative/omim/issues/129
def cleanup_label(
label: str,
explicit_abbrev: str = None,
replacement_case_method: str = 'lower', # lower | title | upper
replacement_case_method_acronyms = 'upper', # lower | title | upper
conjunctions: List[str] = ['and', 'but', 'yet', 'for', 'nor', 'so'],
little_preps: List[str] = [
'at', 'by', 'in', 'of', 'on', 'to', 'up', 'as', 'it', 'or'],
articles: List[str] = ['a', 'an', 'the'],
CAPITALIZATION_THRESHOLD = 0.75,
word_replacements: Dict[str, str] = None # w/ known cols
label: str,
explicit_abbrev: str = None,
replacement_case_method: str = 'lower', # lower | title | upper
replacement_case_method_acronyms: str = 'upper', # lower | title | upper
conjunctions: List[str] = ['and', 'but', 'yet', 'for', 'nor', 'so'],
little_preps: List[str] = [
'at', 'by', 'in', 'of', 'on', 'to', 'up', 'as', 'it', 'or'],
articles: List[str] = ['a', 'an', 'the'],
CAPITALIZATION_THRESHOLD = 0.75,
word_replacements: Dict[str, str] = None # w/ known cols
) -> str:
"""
Reformat the ALL CAPS OMIM labels to something more pleasant to read.
"""Reformat the ALL CAPS OMIM labels to something more pleasant to read.
This will:
1. remove the abbreviation suffixes
2. convert the roman numerals to integer numbers
2. convert the roman numerals to arabic
3. make the text title case,
except for suplied conjunctions/prepositions/articles
Expand Down Expand Up @@ -269,36 +270,52 @@ def cleanup_label(
return formatted_label


# TODO: get symbols
def parse_alt_and_included_titles(titles: str) -> Tuple[List[str], List[str], bool]:
"""Parse delimited titles/symbol pairs from string to list, and detect any 'included' cases.
def remove_included_and_formerly_suffixes(title: str) -> str:
"""Remove ', INCLUDED' and ', FORMERLY' suffixes from a title"""
for suffix in ['FORMERLY', 'INCLUDED']:
title = re.sub(r',\s*' + suffix, '', title, re.IGNORECASE)
return title


def separate_former_titles_and_symbols(
titles: List[str], symbols: List[str]
) -> Tuple[List[str], List[str], List[str], List[str]]:
"""Separate current title/symbols from deprecated (marked 'former') ones"""
former_titles = [x for x in titles if ', FORMERLY' in x.upper()]
former_symbols = [x for x in symbols if ', FORMERLY' in x.upper()]
current_titles = [x for x in titles if ', FORMERLY' not in x.upper()]
current_symbols = [x for x in symbols if ', FORMERLY' not in x.upper()]
return current_titles, current_symbols, former_titles, former_symbols

This assumes that the titles are double-semicolon (';;') delimited. This will additionally pass each through the
_cleanup_label() method to convert the screaming ALL CAPS to something more pleasant to read.

:param titles: a string of 1+ pairs of symbol/titles, 1 title and and 0-2+ symbols per pair, e.g.:
def clean_alt_and_included_titles(titles: List[str], symbols: List[str]) -> Tuple[List[str], List[str]]:
"""Remove ', INCLUDED' and ', FORMERLY' suffixes from titles/symbols & misc title reformatting"""
# remove ', included' and ', formerly', if present
titles2 = [remove_included_and_formerly_suffixes(x) for x in titles]
symbols2 = [remove_included_and_formerly_suffixes(x) for x in symbols]
# additional reformatting for titles
titles2 = [cleanup_label(x) for x in titles2]
return titles2, symbols2


def parse_title_symbol_pairs(title_symbol_pairs_str: str) -> Tuple[List[str], List[str]]:
"""Separate string of delimited titles/symbol pairs into lists of titles and symbols
:param title_symbol_pairs_str: a string of 1+ pairs of symbol/titles, delimited by ;;, 1 title and and 0-2+ symbols
per pair, delimited by ;, e.g.:
Alternative Title(s); symbol(s):
ACROCEPHALOSYNDACTYLY, TYPE V; ACS5;; ACS V;; NOACK SYNDROME
Included Title(s); symbols:
CRANIOFACIAL-SKELETAL-DERMATOLOGIC DYSPLASIA, INCLUDED
:return:
List[str]: cleaned-up titles
List[str]: symbols
bool: whether any of the labels ended with 'included'
"""
labels = []
label_endswith_included = False
for title in titles.split(';;'):
# remove ', included', if present
title = title.strip()
label = re.sub(r',\s*INCLUDED', '', title, re.IGNORECASE)
label_endswith_included = label != title
# TODO: Only use this on titles, not symbols
label = cleanup_label(label)
labels.append(label)

return labels, label_endswith_included
titles: List[str] = []
symbols: List[str] = []
title_symbol_pairs: List[str] = title_symbol_pairs_str.split(';;')
for pair_str in title_symbol_pairs:
pair: List[str] = [x.strip() for x in pair_str.split(';')]
titles.append(pair[0])
symbols.extend(pair[1:])
return titles, symbols


def get_mapped_gene_ids(entry) -> List[str]:
Expand Down

0 comments on commit 9ef0d72

Please sign in to comment.