Skip to content

Commit

Permalink
improve: improve import time with lazy regex and imports (#506)
Browse files Browse the repository at this point in the history
  • Loading branch information
hukkin authored Jan 8, 2025
1 parent 27bce74 commit f6ce9c9
Show file tree
Hide file tree
Showing 8 changed files with 57 additions and 25 deletions.
7 changes: 7 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,13 @@ commands = [
["python", "-m", "timeit", "from mdformat._cli import run", 'run(["README.md", "docs/", "--check", "--wrap", "50"])'],
]

[tool.tox.env."benchmark-import"]
description = "Measure module import times. Tox sends mdformat output to stderr, so to filter use e.g. `tox -e benchmark-import 2> >(grep mdformat)`."
deps = []
commands = [
["python", "-X", "importtime", "-m", "mdformat"],
]


[tool.coverage.run]
source = ["mdformat"]
Expand Down
3 changes: 2 additions & 1 deletion src/mdformat/codepoints/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
"ASCII_WHITESPACE",
)

import warnings

from mdformat.codepoints._unicode_punctuation import UNICODE_PUNCTUATION
from mdformat.codepoints._unicode_whitespace import UNICODE_WHITESPACE
Expand All @@ -19,6 +18,8 @@ def __getattr__(name: str) -> frozenset[str]:
Used during the deprecation period of `ASCII_WHITESPACE`.
"""
if name == "ASCII_WHITESPACE":
import warnings

warnings.warn(
"ASCII_WHITESPACE is deprecated because CommonMark v0.30 no longer "
"defines ASCII whitespace.",
Expand Down
4 changes: 2 additions & 2 deletions src/mdformat/plugins.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
from collections.abc import Callable, Mapping
from typing import TYPE_CHECKING, Any, Protocol

from markdown_it import MarkdownIt

from mdformat._compat import importlib_metadata

if TYPE_CHECKING:
from markdown_it import MarkdownIt

from mdformat.renderer.typing import Postprocess, Render


Expand Down
10 changes: 6 additions & 4 deletions src/mdformat/renderer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,15 @@
import logging
import string
from types import MappingProxyType
from typing import Any

from markdown_it.token import Token
from typing import TYPE_CHECKING, Any

from mdformat.renderer._context import DEFAULT_RENDERERS, WRAP_POINT, RenderContext
from mdformat.renderer._tree import RenderTreeNode
from mdformat.renderer.typing import Postprocess

if TYPE_CHECKING:
from markdown_it.token import Token

from mdformat.renderer.typing import Postprocess

LOGGER = logging.getLogger(__name__)

Expand Down
4 changes: 2 additions & 2 deletions src/mdformat/renderer/_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from mdformat import codepoints
from mdformat._conf import DEFAULT_OPTS
from mdformat.renderer._util import (
RE_CHAR_REFERENCE,
decimalify_leading,
decimalify_trailing,
escape_asterisk_emphasis,
Expand All @@ -27,6 +26,7 @@
is_tight_list_item,
longest_consecutive_sequence,
maybe_add_link_brackets,
re_char_reference,
)

if TYPE_CHECKING:
Expand Down Expand Up @@ -137,7 +137,7 @@ def text(node: RenderTreeNode, context: RenderContext) -> str:

# Escape "&" if it starts a sequence that can be interpreted as
# a character reference.
text = RE_CHAR_REFERENCE.sub(r"\\\g<0>", text)
text = re_char_reference().sub(r"\\\g<0>", text)

# The parser can give us consecutive newlines which can break
# the markdown structure. Replace two or more consecutive newlines
Expand Down
37 changes: 23 additions & 14 deletions src/mdformat/renderer/_util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

from collections.abc import Iterable
import functools
import html.entities
import re
from typing import TYPE_CHECKING
Expand All @@ -10,20 +11,28 @@
if TYPE_CHECKING:
from mdformat.renderer import RenderTreeNode

# Regex that finds character references.
# The reference can be either
# 1. decimal representation, e.g. &#11;
# 2. hex representation, e.g. &#x1e;
# 3. HTML5 entity reference, e.g. &nbsp;
RE_CHAR_REFERENCE = re.compile(
"&(?:"
+ "#[0-9]{1,7}"
+ "|"
+ "#[Xx][0-9A-Fa-f]{1,6}"
+ "|"
+ "|".join({c.rstrip(";") for c in html.entities.html5})
+ ");"
)

@functools.cache
def re_char_reference() -> re.Pattern[str]:
    """Return a regex that finds character references.

    The reference can be either:

    1. decimal representation, e.g. &#11;
    2. hex representation, e.g. &#x1e;
    3. HTML5 entity reference, e.g. &nbsp;

    This cached function compiles the regex lazily,
    as compilation can take over 20ms.
    """
    # Strip the optional trailing ";" from every entity name and deduplicate
    # with a set. Sort the result so the pattern text is identical on every
    # run (set iteration order of strings depends on hash randomisation).
    # Match results do not depend on the order: every alternative must be
    # followed by ";", so the engine backtracks to the right name.
    entity_names = sorted({c.rstrip(";") for c in html.entities.html5})
    return re.compile(
        "&(?:"
        + "#[0-9]{1,7}"
        + "|"
        + "#[Xx][0-9A-Fa-f]{1,6}"
        + "|"
        + "|".join(entity_names)
        + ");"
    )


def is_tight_list(node: RenderTreeNode) -> bool:
Expand Down
9 changes: 9 additions & 0 deletions tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,3 +146,12 @@ def test_mdrenderer_no_finalize(tmp_path):
def test_ascii_whitespace_deprecation():
    """Accessing the deprecated `ASCII_WHITESPACE` emits a warning."""
    with pytest.warns(DeprecationWarning):
        # Attribute access alone is what should trigger the warning.
        mdformat.codepoints.ASCII_WHITESPACE


def test_import_typing():
    """Try to import mdformat.renderer.typing.

    The module consists of annotation types only, so mdformat never
    imports it at runtime. This test ensures that it still runs.
    """
    # Imported inside the test on purpose: the import itself is the check.
    import mdformat.renderer.typing  # noqa: F401
8 changes: 6 additions & 2 deletions tests/utils.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
import json
from __future__ import annotations

from markdown_it import MarkdownIt
import json
from typing import TYPE_CHECKING

from mdformat._cli import run
from mdformat._conf import read_toml_opts
from mdformat.renderer import RenderContext, RenderTreeNode

if TYPE_CHECKING:
from markdown_it import MarkdownIt

UNFORMATTED_MARKDOWN = "\n\n# A header\n\n"
FORMATTED_MARKDOWN = "# A header\n"

Expand Down

0 comments on commit f6ce9c9

Please sign in to comment.