Skip to content

Commit

Permalink
use HTML as intermediate format
Browse files Browse the repository at this point in the history
  • Loading branch information
marph91 committed Jan 20, 2025
1 parent 51fc78f commit 40cc52c
Show file tree
Hide file tree
Showing 7 changed files with 68 additions and 27 deletions.
11 changes: 11 additions & 0 deletions docs/contributing/development_considerations.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,14 @@ graph TD;
note_extraction -- Pandoc Supported Formats --> Pandoc;
Pandoc --> Markdown;
```

## Intermediate Format

- **HTML**:
- Easily modifiable with beautifulsoup and other tools.
- Supports a wide range of elements that can be "reduced" to Markdown.
- No additional dependency (beautifulsoup is used already).
- Pandoc AST:
- Python: Panflute and pandocfilters aren't up-to-date (problems with tables especially).
- Lua: Learning curve, second scripting language in this repo.
- General: Some filters would still require preprocessing in HTML (for example iframes).
8 changes: 4 additions & 4 deletions src/formats/rednotebook.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@ def handle_markdown_links(self, body: str) -> tuple[str, imf.Resources]:
# Links are usually enclosed with double quotation marks.
# They get removed in some cases when parsing. Add them again
# to get the original string.
if not link.url.startswith('""'):
link.url = f'""{link.url}""'
if not link.url.startswith('%22%22'):
link.url = f'%22%22{link.url}%22%22'
original_link_text = str(link)

# remove double quotation marks
link.url = link.url.replace('""', "")
link.url = link.url.replace('%22%22', "")
# remove the "file://" protocol if needed
parsed_link = urlparse(link.url)
if parsed_link.scheme == "file":
Expand All @@ -35,7 +35,7 @@ def handle_markdown_links(self, body: str) -> tuple[str, imf.Resources]:
if link.is_web_link or link.is_mail_link:
# Resource links get replaced later,
# but these links need to be replaced here.
body = body.replace(f'""{link.url}""', link.url)
body = body.replace(f'%22%22{link.url}%22%22', link.url)
else:
# resource
if link.url is None:
Expand Down
2 changes: 1 addition & 1 deletion src/formats/synology_note_station.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def convert_note(self, note_id, note_id_title_map):
self.logger.debug(f"Ignoring note in trash \"{note['title']}\"")
return
title = note["title"]
self.logger.debug(f'Converting note "{title}" (ID: {note_id})')
self.logger.debug(f'Converting note "{title}" (ID: "{note_id}")')

# resources / attachments
resources = self.map_resources_by_hash(note)
Expand Down
2 changes: 1 addition & 1 deletion src/formats/zoho_notebook.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def convert_note(self, file_: Path):
metadata[key] = json.loads(value)

title = metadata["data-notecard"]["name"]
self.logger.debug(f'Converting note "{title}"')
self.logger.debug(f'Converting note "{title}" (ID: "{file_.stem}")')

# get or find parent notebook
# Assume that notebooks can't be nested.
Expand Down
58 changes: 39 additions & 19 deletions src/markdown_lib/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,28 +263,23 @@ def get_inline_tags(text: str, start_characters: list[str]) -> list[str]:
# fmt:on


def markup_to_markdown(
text: bytes | str, format_: str = "html", resource_folder: Path = Path("tmp_media")
) -> str:
# Route everything through this function to get a single path of truth.
if format_ == "html":
# some needed preprocessing
soup = BeautifulSoup(text, "html.parser")
markdown_lib.html_preprocessing.div_checklists(soup)
markdown_lib.html_preprocessing.highlighting(soup)
markdown_lib.html_preprocessing.iframes_to_links(soup)
markdown_lib.html_preprocessing.streamline_tables(soup)
markdown_lib.html_preprocessing.synology_note_station_fix_img_src(soup)
markdown_lib.html_preprocessing.whitespace_in_math(soup)
text = str(soup)
def html_to_markdown(text_html: bytes | str):
# some needed preprocessing
soup = BeautifulSoup(text_html, "html.parser")
markdown_lib.html_preprocessing.div_checklists(soup)
markdown_lib.html_preprocessing.highlighting(soup)
markdown_lib.html_preprocessing.iframes_to_links(soup)
markdown_lib.html_preprocessing.streamline_tables(soup)
markdown_lib.html_preprocessing.synology_note_station_fix_img_src(soup)
markdown_lib.html_preprocessing.whitespace_in_math(soup)
text_html_filtered = str(soup)

# writer: json ast -> markdown
text_md = pypandoc.convert_text(
text,
text_html_filtered,
PANDOC_OUTPUT_FORMAT,
format=format_,
sandbox=True,
format="html",
extra_args=[
# somehow the temp folder is needed to create the resources properly
f"--extract-media={resource_folder}",
# don't create artificial line breaks
"--wrap=none",
],
Expand All @@ -295,6 +290,31 @@ def markup_to_markdown(
text_md = text_md.replace("{TEMPORARYNEWLINE}", "<br>")
return text_md.strip()

def markup_to_markdown(
    text: bytes | str, format_: str = "html", resource_folder: Path = Path("tmp_media")
) -> str:
    """Convert markup in any pandoc-readable format to Markdown.

    HTML serves as the intermediate format: non-HTML input is first
    converted to HTML by pandoc, then the shared HTML pipeline
    (filtering + Markdown writer) produces the final result.
    Route everything through this function to get a single path of truth.

    :param text: The document to convert.
    :param format_: Pandoc reader format of ``text`` (may include
        extensions, e.g. ``"html+tex_math_dollars"``).
    :param resource_folder: Folder where pandoc extracts embedded media.
    :return: The converted Markdown string.
    """
    if format_.startswith("html"):
        # Already HTML - no reader step needed.
        intermediate_html = text
    else:
        # reader: x -> HTML
        pandoc_args = [
            # somehow the temp folder is needed to create the resources properly
            f"--extract-media={resource_folder}",
            # don't create artificial line breaks
            "--wrap=none",
        ]
        intermediate_html = pypandoc.convert_text(
            text,
            "html",
            format=format_,
            sandbox=True,
            extra_args=pandoc_args,
        )

    # HTML filter: HTML -> filter -> HTML
    # writer: HTML -> Markdown
    return html_to_markdown(intermediate_html)


# Problem: "//" is part of many URI (between scheme and host).
# We need to exclude them to prevent unwanted conversions.
Expand Down
12 changes: 11 additions & 1 deletion src/markdown_lib/html_preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
"""HTML preprocessing functions to prepare for Pandoc conversion."""
"""
HTML preprocessing functions to prepare for Pandoc conversion.
Should be used:
- For format specific conversions.
- If they can't be expressed in another way.
"""

import logging
import string
Expand Down Expand Up @@ -146,6 +152,10 @@ def streamline_tables(soup: BeautifulSoup):
# if not isinstance(c, str) or c.strip()
# ]

# Remove nested tables.
for nested_table in table.find_all("table"):
nested_table.unwrap() # TODO: revisit

# Remove all divs, since they cause pandoc to fail converting the table.
# https://stackoverflow.com/a/32064299/7410886
tags_to_remove = ["div", "span"]
Expand Down

0 comments on commit 40cc52c

Please sign in to comment.