hukkin · hukkin · Oct 11, 2022 · Oct 11, 2022 · Dec 15, 2022 · Nov 11, 2024
diff --git a/src/mdformat/_util.py b/src/mdformat/_util.py
@@ -2,6 +2,7 @@
 
 from collections.abc import Iterable, Mapping
 from contextlib import nullcontext
+from html.parser import HTMLParser
 import re
 from types import MappingProxyType
 from typing import Any, Literal
@@ -102,6 +103,82 @@ def is_md_equal(
     return html_texts["md1"] == html_texts["md2"]
 
 
+def normalize_html_ast(ast: list[dict]) -> list[dict]:
+    """Not implemented. TODO: remove empty p tags, remove formatted code."""
+    raise NotImplementedError
+
+
+class HTML2AST(HTMLParser):
+    """Parse HTML to a list/dict structure that can be used in comparisons.
+
+    Tags become dicts with "tag" and "attrs" keys, plus "children" and "data"
+    lists where present. HTML2AST.parse() is the only public interface.
+    """
+
+    def __init__(self) -> None:
+        super().__init__(convert_charrefs=True)
+
+    def reset(self) -> None:
+        """Clear parser state. This is also called by HTMLParser.__init__."""
+        super().reset()
+        self.tree: list[dict] = []
+        self.current: dict | None = None
+
+    def parse(self, text: str, strip_classes: Iterable[str] = ()) -> list[dict]:
+        """Return the AST of `text`, emptying tags that have a stripped class."""
+        self.reset()
+        self.feed(text)
+        self.close()
+        # Tags left unclosed still hold their temporary "parent" link: drop it.
+        while self.current is not None:
+            self.current = self.current.pop("parent")
+        # Use the returned list: stripped top-level tags are replaced in it.
+        self.tree = self.strip_classes(self.tree, set(strip_classes))
+        return self.tree
+
+    # TODO: remove?
+    def strip_classes(self, tree: list[dict], classes: set[str]) -> list[dict]:
+        """Return `tree` with content removed from tags having any of `classes`."""
+        items = []
+        for item in tree:
+            # A valueless `class` attribute is parsed as None, not as "".
+            if classes.intersection((item["attrs"].get("class") or "").split()):
+                items.append({"tag": item["tag"], "attrs": item["attrs"]})
+                continue
+            items.append(item)
+            item["children"] = self.strip_classes(item.get("children", []), classes)
+            if not item["children"]:
+                item.pop("children")
+        return items
+
+    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+        """Attach a new tag under the currently open one and descend into it."""
+        tag_item = {"tag": tag, "attrs": dict(attrs), "parent": self.current}
+        if self.current is None:
+            self.tree.append(tag_item)
+        else:
+            self.current.setdefault("children", []).append(tag_item)
+        self.current = tag_item
+
+    def handle_endtag(self, tag: str) -> None:
+        """Close open tags through the innermost `tag` (all of them if none match)."""
+        while self.current is not None:
+            closed = self.current
+            self.current = closed.pop("parent")
+            if closed["tag"] == tag:
+                break
+
+    def handle_data(self, data: str) -> None:
+        """Record text under the open tag; text outside any tag is ignored."""
+        if self.current is None:
+            return
+        if self.current["tag"] == "p":
+            # Strip insignificant paragraph leading/trailing whitespace, then
+            # reduce all collapsible whitespace to a single space
+            data = re.sub(r"[\n\t ]+", " ", data.strip())
+        self.current.setdefault("data", []).append(data)
+
+
 def detect_newline_type(md: str, eol_setting: str) -> Literal["\n", "\r\n"]:
     """Returns the newline-character to be used for output.
 

diff --git a/tests/test_html2ast.py b/tests/test_html2ast.py
@@ -0,0 +1,109 @@
+from mdformat._util import HTML2AST
+
+
+def test_html2ast():
+    """Nested tags end up in "children", text in the enclosing tag's "data"."""
+    strike = {"tag": "s", "attrs": {}, "data": ["j"]}
+    paragraph = {
+        "tag": "p",
+        "attrs": {"class": "x"},
+        "data": ["a"],
+        "children": [strike],
+    }
+    expected = [
+        {"tag": "div", "attrs": {}, "children": [paragraph]},
+        {"tag": "a", "attrs": {}, "data": ["b"]},
+    ]
+
+    html = '<div><p class="x">a<s>j</s></p></div><a>b</a>'
+
+    assert HTML2AST().parse(html) == expected
+
+
+def test_html2ast_multiline():
+    expected = [{"tag": "div", "attrs": {}, "data": ["a\nb \nc \n\n"]}]  # verbatim
+    assert HTML2AST().parse("<div>a\nb \nc \n\n</div>") == expected
+
+
+def test_html2ast_nested():
+    """Same-named tags nest; each end tag closes the innermost open one."""
+    inner = {"tag": "a", "attrs": {"d": "3"}, "data": ["e"]}
+    middle = {
+        "tag": "a",
+        "attrs": {"d": "2"},
+        "data": ["c"],
+        "children": [inner],
+    }
+    outer = {
+        "tag": "a",
+        "attrs": {"d": "1"},
+        "data": ["b"],
+        "children": [middle],
+    }
+
+    assert HTML2AST().parse("<a d=1>b<a d=2>c<a d=3>e</a></a></a>") == [outer]
+
+
+def test_html2ast_strip():
+    """A tag with a stripped class keeps its tag and attrs but no content."""
+    html = '<div><p class="x y">a<s>j</s></p></div><a>b</a>'
+    stripped = {"tag": "p", "attrs": {"class": "x y"}}
+
+    ast = HTML2AST().parse(html, {"x"})
+    assert ast == [
+        {"tag": "div", "attrs": {}, "children": [stripped]},
+        {"tag": "a", "attrs": {}, "data": ["b"]},
+    ]
+
+
+def test_html2ast_multiple_content():
+    """Text around child tags is collected, chunk by chunk, in the parent."""
+    html = """
+<div>
+hello
+
+<p class="x y">a</p>
+<p class="a b"></p>
+
+   another  hello  in  the same div
+this one is multiline
+</div>
+"""
+
+    trailing_text = """
+
+   another  hello  in  the same div
+this one is multiline
+"""
+
+    paragraphs = [
+        {"tag": "p", "attrs": {"class": "x y"}, "data": ["a"]},
+        {"tag": "p", "attrs": {"class": "a b"}},
+    ]
+
+    div = {
+        "tag": "div",
+        "attrs": {},
+        "children": paragraphs,
+        "data": ["\nhello\n\n", "\n", trailing_text],
+    }
+
+    assert HTML2AST().parse(html) == [div]
+
+
+def test_html2ast_empty_paragraphs():
+    """Whitespace-only paragraph text is reduced to an empty string."""
+    html = """
+<p></p>
+<p>a</p>
+<p>
+</p>
+<p> </p>
+"""
+    expected = [
+        {"tag": "p", "attrs": {}},  # no text at all: no "data" key
+        {"tag": "p", "attrs": {}, "data": ["a"]},
+        {"tag": "p", "attrs": {}, "data": [""]},  # newline only
+        {"tag": "p", "attrs": {}, "data": [""]},  # single space
+    ]
+    assert HTML2AST().parse(html) == expected