Skip to content

Commit

Permalink
CU-86948wv58: Fix CSV output relative start and end for annotations
Browse files Browse the repository at this point in the history
  • Loading branch information
mart-r committed Apr 9, 2024
1 parent 77012ea commit 1729995
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 0 deletions.
7 changes: 7 additions & 0 deletions medcat/compare_models/compare_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,13 @@ def _get_text(self, raw_text: str, span_char_limit: Optional[int],
min_char_nr = max(min(start1, start2) - span_char_limit, 0)
max_char_nr = min(max(end1, end2) + span_char_limit, len(raw_text) + 1)
text = raw_text[min_char_nr: max_char_nr]
# update start and end chars so that they match the new text
if ann1:
ann1['start'], ann1['end'] = start1 - min_char_nr, end1 - min_char_nr
ann1['start-raw'], ann1['end-raw'] = start1, end1
if ann2:
ann2['start'], ann2['end'] = start2 - min_char_nr, end2 - min_char_nr
ann2['start-raw'], ann2['end-raw'] = start2, end2
return text

def _to_raw(self, docs: Set[str],
Expand Down
24 changes: 24 additions & 0 deletions medcat/compare_models/tests/test_compare_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -786,3 +786,27 @@ def test_can_recreate_dicts(self):
df = pd.read_csv(self.file)
self.assert_can_recreate_dicts(df, "ann1")
self.assert_can_recreate_dicts(df, "ann2")

def assert_annotations_remain_same(self, df: pd.DataFrame, column: str,
                                   expected: list):
    """Assert that the annotations stored in ``df[column]`` match ``expected``.

    Every non-null cell of the column is evaluated back into a dict, the
    ``start-raw`` / ``end-raw`` keys are removed, and the resulting dicts
    are compared one by one (in order) with the entities flattened out of
    ``expected``.

    Args:
        df: The data frame to check.
        column: Name of the column holding the stringified annotations.
        expected: Parts, each with an ``"entities"`` mapping whose values
            are the expected annotation dicts.
    """
    # flatten the per-part entity mappings into a single list of annotations
    expected = [value for part in expected
                for value in part["entities"].values()]
    series = df[column]
    # `v == v` is false only for NaN-like values, so those are dropped as well
    anns = [v for _, v in series[series.notnull()].items() if v == v]
    # NOTE(review): eval() executes whatever is in the cell; this is only
    #               acceptable as long as the data frame holds trusted data
    anns = [eval(v) for v in anns]
    # remove raw starts (additions)
    # NOTE: this only works so far since the span is greater
    #       than the document length.
    #       Otherwise, I'd need to read the `-raw` parts and
    #       write them to the corresponding spot
    for v in anns:
        del v['start-raw']
        del v['end-raw']
    # check the lengths first since zip() would silently truncate
    self.assertEqual(len(anns), len(expected))
    for nr, (got, expect) in enumerate(zip(anns, expected)):
        with self.subTest(f"Nr: {nr}"):
            self.assertEqual(got, expect)

def test_annotations_remain_same(self):
df = pd.read_csv(self.file)
self.assert_annotations_remain_same(df, 'ann1', self.annotations1)
self.assert_annotations_remain_same(df, 'ann2', self.annotations2)

0 comments on commit 1729995

Please sign in to comment.