diff --git a/src/markdown_lib/common.py b/src/markdown_lib/common.py
index 2b61d10..d161fae 100644
--- a/src/markdown_lib/common.py
+++ b/src/markdown_lib/common.py
@@ -237,6 +237,7 @@ def markup_to_markdown(text: str, format_: str = "html") -> str:
         # some needed preprocessing
         soup = BeautifulSoup(text, "html.parser")
         markdown_lib.html_preprocessing.div_checklists(soup)
+        markdown_lib.html_preprocessing.handle_newlines_in_math(soup)
         markdown_lib.html_preprocessing.iframes_to_links(soup)
         markdown_lib.html_preprocessing.streamline_tables(soup)
         markdown_lib.html_preprocessing.synology_note_station_fix_img_src(soup)
diff --git a/src/markdown_lib/html_preprocessing.py b/src/markdown_lib/html_preprocessing.py
index 644274e..0dd2aa4 100644
--- a/src/markdown_lib/html_preprocessing.py
+++ b/src/markdown_lib/html_preprocessing.py
@@ -1,7 +1,12 @@
 """HTML preprocessing functions to prepare for Pandoc conversion."""
 
+import logging
+import string
+
 from bs4 import BeautifulSoup
 
+LOGGER = logging.getLogger("jimmy")
+
 
 def div_checklists(soup: BeautifulSoup):
     """Convert div checklists to plain HTML checklists."""
@@ -18,6 +23,28 @@ def div_checklists(soup: BeautifulSoup):
             child.name = "li"
 
 
+def handle_newlines_in_math(soup: BeautifulSoup):
+    """
+    Prepare TeX math annotations for conversion. Modifies the soup in place.
+
+    - Strip trailing whitespace and backslashes from the TeX source.
+    - Replace each empty line by an explicit TeX line break (double backslash).
+
+    Annotations with another encoding, or without a single text child, are skipped.
+    """
+    for annotation in soup.find_all("annotation"):
+        if (encoding := annotation.attrs.get("encoding")) != "application/x-tex":
+            LOGGER.debug(f'Unsupported annotation encoding "{encoding}"')
+            continue
+        # ".string" is None for an empty tag or a tag with several children.
+        # Calling ".rstrip()" on it would raise an AttributeError.
+        if (tex := annotation.string) is None:
+            continue
+        annotation.string = tex.rstrip("\\" + string.whitespace).replace(
+            "\n\n", "\n\\\\\n"
+        )
+
+
 def iframes_to_links(soup: BeautifulSoup):
     """Convert iframes to simple links."""
     for iframe in soup.find_all("iframe"):
diff --git a/test/data b/test/data
index e776c4b..c298ea4 160000
--- a/test/data
+++ b/test/data
@@ -1 +1 @@
-Subproject commit e776c4beb4659cc7ce46b539f4efde7b4335a8b1
+Subproject commit c298ea4e5d371497b01b37faf2483b3e5393f299