Skip to content

Commit

Permalink
ODT: add more test data and improve conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
marph91 committed Jan 23, 2025
1 parent b26cbb3 commit 3a075f6
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 6 deletions.
12 changes: 7 additions & 5 deletions src/markdown_lib/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,7 @@ def get_inline_tags(text: str, start_characters: list[str]) -> list[str]:
}

# fmt: off
INTERMEDIATE_FORMAT = "html"
PANDOC_OUTPUT_FORMAT = (
# https://pandoc.org/chunkedhtml-demo/8.22-markdown-variants.html
# Don't use "commonmark_x". There is too much noise.
Expand Down Expand Up @@ -267,21 +268,22 @@ def get_inline_tags(text: str, start_characters: list[str]) -> list[str]:
def html_to_markdown(text_html: bytes | str, custom_filter: list | None = None):
# some needed preprocessing
soup = BeautifulSoup(text_html, "html.parser")
if custom_filter is not None:
for filter_ in custom_filter:
filter_(soup)
markdown_lib.html_filter.div_checklists(soup)
markdown_lib.html_filter.highlighting(soup)
markdown_lib.html_filter.iframes_to_links(soup)
markdown_lib.html_filter.merge_single_element_lists(soup)
markdown_lib.html_filter.streamline_tables(soup)
markdown_lib.html_filter.whitespace_in_math(soup)
if custom_filter is not None:
for filter_ in custom_filter:
filter_(soup)
text_html_filtered = str(soup)

# writer: json ast -> markdown
text_md = pypandoc.convert_text(
text_html_filtered,
PANDOC_OUTPUT_FORMAT,
- format="html",
+ format=INTERMEDIATE_FORMAT,
extra_args=[
# don't create artificial line breaks
"--wrap=none",
Expand All @@ -307,7 +309,7 @@ def markup_to_markdown(
# reader: x -> HTML
text_html = pypandoc.convert_text(
text,
- "html",
+ INTERMEDIATE_FORMAT,
format=format_,
sandbox=True,
extra_args=[
Expand Down
1 change: 1 addition & 0 deletions test/test_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ def test_formats(self, test_input):
["html", ["default_format/html"]],
["latex", ["default_format/latex"]],
["mediawiki", ["default_format/mediawiki"]],
["odt", ["default_format/odt"]],
["txt2tags", ["default_format/txt2tags-2"]],
["vimwiki", ["default_format/vimwiki"]],
]
Expand Down

0 comments on commit 3a075f6

Please sign in to comment.