Skip to content

Commit

Permalink
use HTML as intermediate format
Browse files Browse the repository at this point in the history
  • Loading branch information
marph91 committed Jan 20, 2025
1 parent 51fc78f commit 40cc52c
Show file tree
Hide file tree
Showing 7 changed files with 68 additions and 27 deletions.
11 changes: 11 additions & 0 deletions docs/contributing/development_considerations.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,14 @@ graph TD;
note_extraction -- Pandoc Supported Formats --> Pandoc;
Pandoc --> Markdown;
```

## Intermediate Format

- **HTML**:
- Easily modifiable with beautifulsoup and other tools.
- Supports a wide range of elements that can be "reduced" to Markdown.
- No additional dependency (beautifulsoup is used already).
- Pandoc AST:
- Python: Panflute and pandocfilters aren't up-to-date (problems with tables especially).
- Lua: Learning curve, second scripting language in this repo.
- General: Some filters would still require preprocessing in HTML (for example iframes).
8 changes: 4 additions & 4 deletions src/formats/rednotebook.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@ def handle_markdown_links(self, body: str) -> tuple[str, imf.Resources]:
# Links are usually enclosed with double quotation marks.
# They get removed in some cases when parsing. Add them again
# to get the original string.
if not link.url.startswith('""'):
link.url = f'""{link.url}""'
if not link.url.startswith('%22%22'):
link.url = f'%22%22{link.url}%22%22'
original_link_text = str(link)

# remove double quotation marks
link.url = link.url.replace('""', "")
link.url = link.url.replace('%22%22', "")
# remove the "file://" protocol if needed
parsed_link = urlparse(link.url)
if parsed_link.scheme == "file":
Expand All @@ -35,7 +35,7 @@ def handle_markdown_links(self, body: str) -> tuple[str, imf.Resources]:
if link.is_web_link or link.is_mail_link:
# Resource links get replaced later,
# but these links need to be replaced here.
body = body.replace(f'""{link.url}""', link.url)
body = body.replace(f'%22%22{link.url}%22%22', link.url)
else:
# resource
if link.url is None:
Expand Down
2 changes: 1 addition & 1 deletion src/formats/synology_note_station.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def convert_note(self, note_id, note_id_title_map):
self.logger.debug(f"Ignoring note in trash \"{note['title']}\"")
return
title = note["title"]
self.logger.debug(f'Converting note "{title}" (ID: {note_id})')
self.logger.debug(f'Converting note "{title}" (ID: "{note_id}")')

# resources / attachments
resources = self.map_resources_by_hash(note)
Expand Down
2 changes: 1 addition & 1 deletion src/formats/zoho_notebook.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def convert_note(self, file_: Path):
metadata[key] = json.loads(value)

title = metadata["data-notecard"]["name"]
self.logger.debug(f'Converting note "{title}"')
self.logger.debug(f'Converting note "{title}" (ID: "{file_.stem}")')

# get or find parent notebook
# Assume that notebooks can't be nested.
Expand Down
58 changes: 39 additions & 19 deletions src/markdown_lib/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,28 +263,23 @@ def get_inline_tags(text: str, start_characters: list[str]) -> list[str]:
# fmt:on


def markup_to_markdown(
text: bytes | str, format_: str = "html", resource_folder: Path = Path("tmp_media")
) -> str:
# Route everything through this function to get a single path of truth.
if format_ == "html":
# some needed preprocessing
soup = BeautifulSoup(text, "html.parser")
markdown_lib.html_preprocessing.div_checklists(soup)
markdown_lib.html_preprocessing.highlighting(soup)
markdown_lib.html_preprocessing.iframes_to_links(soup)
markdown_lib.html_preprocessing.streamline_tables(soup)
markdown_lib.html_preprocessing.synology_note_station_fix_img_src(soup)
markdown_lib.html_preprocessing.whitespace_in_math(soup)
text = str(soup)
def html_to_markdown(text_html: bytes | str):
# some needed preprocessing
soup = BeautifulSoup(text_html, "html.parser")
markdown_lib.html_preprocessing.div_checklists(soup)
markdown_lib.html_preprocessing.highlighting(soup)
markdown_lib.html_preprocessing.iframes_to_links(soup)
markdown_lib.html_preprocessing.streamline_tables(soup)
markdown_lib.html_preprocessing.synology_note_station_fix_img_src(soup)
markdown_lib.html_preprocessing.whitespace_in_math(soup)
text_html_filtered = str(soup)

# writer: json ast -> markdown
text_md = pypandoc.convert_text(
text,
text_html_filtered,
PANDOC_OUTPUT_FORMAT,
format=format_,
sandbox=True,
format="html",
extra_args=[
# somehow the temp folder is needed to create the resources properly
f"--extract-media={resource_folder}",
# don't create artificial line breaks
"--wrap=none",
],
Expand All @@ -295,6 +290,31 @@ def markup_to_markdown(
text_md = text_md.replace("{TEMPORARYNEWLINE}", "<br>")
return text_md.strip()

def markup_to_markdown(
    text: bytes | str, format_: str = "html", resource_folder: Path = Path("tmp_media")
) -> str:
    """Convert markup in any pandoc-readable format to Markdown.

    HTML serves as the intermediate format: non-HTML input is first
    converted to HTML by pandoc, then the shared HTML pipeline
    (filtering + Markdown writer) produces the final result.
    Route everything through this function to get a single path of truth.

    :param text: The document to convert.
    :param format_: Pandoc reader format of ``text`` (may include
        extensions, e.g. ``"html+tex_math_dollars"``).
    :param resource_folder: Folder where pandoc extracts embedded media.
    :return: The converted Markdown string.
    """
    if format_.startswith("html"):
        # Already HTML - no reader step needed.
        intermediate_html = text
    else:
        # reader: x -> HTML
        pandoc_args = [
            # somehow the temp folder is needed to create the resources properly
            f"--extract-media={resource_folder}",
            # don't create artificial line breaks
            "--wrap=none",
        ]
        intermediate_html = pypandoc.convert_text(
            text,
            "html",
            format=format_,
            sandbox=True,
            extra_args=pandoc_args,
        )

    # HTML filter: HTML -> filter -> HTML
    # writer: HTML -> Markdown
    return html_to_markdown(intermediate_html)


# Problem: "//" is part of many URI (between scheme and host).
# We need to exclude them to prevent unwanted conversions.
Expand Down
12 changes: 11 additions & 1 deletion src/markdown_lib/html_preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
"""HTML preprocessing functions to prepare for Pandoc conversion."""
"""
HTML preprocessing functions to prepare for Pandoc conversion.
Should be used:
- For format specific conversions.
- If they can't be expressed in another way.
"""

import logging
import string
Expand Down Expand Up @@ -146,6 +152,10 @@ def streamline_tables(soup: BeautifulSoup):
# if not isinstance(c, str) or c.strip()
# ]

# Remove nested tables.
for nested_table in table.find_all("table"):
nested_table.unwrap() # TODO: revisit

# Remove all divs, since they cause pandoc to fail converting the table.
# https://stackoverflow.com/a/32064299/7410886
tags_to_remove = ["div", "span"]
Expand Down

0 comments on commit 40cc52c

Please sign in to comment.