From f6ce9c95a374b7aafd15cab080a2c7a8db435984 Mon Sep 17 00:00:00 2001 From: Taneli Hukkinen <3275109+hukkin@users.noreply.github.com> Date: Wed, 8 Jan 2025 11:45:15 +0200 Subject: [PATCH] improve: improve import time with lazy regex and imports (#506) --- pyproject.toml | 7 ++++++ src/mdformat/codepoints/__init__.py | 3 ++- src/mdformat/plugins.py | 4 ++-- src/mdformat/renderer/__init__.py | 10 ++++---- src/mdformat/renderer/_context.py | 4 ++-- src/mdformat/renderer/_util.py | 37 ++++++++++++++++++----------- tests/test_api.py | 9 +++++++ tests/utils.py | 8 +++++-- 8 files changed, 57 insertions(+), 25 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index bc528773..7981aab0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -156,6 +156,13 @@ commands = [ ["python", "-m", "timeit", "from mdformat._cli import run", 'run(["README.md", "docs/", "--check", "--wrap", "50"])'], ] +[tool.tox.env."benchmark-import"] +description = "Measure module import times. Tox sends mdformat output to stderr, so to filter use e.g. `tox -e benchmark-import 2> >(grep mdformat)`." +deps = [] +commands = [ + ["python", "-X", "importtime", "-m", "mdformat"], +] + [tool.coverage.run] source = ["mdformat"] diff --git a/src/mdformat/codepoints/__init__.py b/src/mdformat/codepoints/__init__.py index 0559849c..0491ae6b 100644 --- a/src/mdformat/codepoints/__init__.py +++ b/src/mdformat/codepoints/__init__.py @@ -5,7 +5,6 @@ "ASCII_WHITESPACE", ) -import warnings from mdformat.codepoints._unicode_punctuation import UNICODE_PUNCTUATION from mdformat.codepoints._unicode_whitespace import UNICODE_WHITESPACE @@ -19,6 +18,8 @@ def __getattr__(name: str) -> frozenset[str]: Used during the deprecation period of `ASCII_WHITESPACE`. 
""" if name == "ASCII_WHITESPACE": + import warnings + warnings.warn( "ASCII_WHITESPACE is deprecated because CommonMark v0.30 no longer " "defines ASCII whitespace.", diff --git a/src/mdformat/plugins.py b/src/mdformat/plugins.py index 8da514ae..bfefe47e 100644 --- a/src/mdformat/plugins.py +++ b/src/mdformat/plugins.py @@ -4,11 +4,11 @@ from collections.abc import Callable, Mapping from typing import TYPE_CHECKING, Any, Protocol -from markdown_it import MarkdownIt - from mdformat._compat import importlib_metadata if TYPE_CHECKING: + from markdown_it import MarkdownIt + from mdformat.renderer.typing import Postprocess, Render diff --git a/src/mdformat/renderer/__init__.py b/src/mdformat/renderer/__init__.py index a802ab51..c335c378 100644 --- a/src/mdformat/renderer/__init__.py +++ b/src/mdformat/renderer/__init__.py @@ -13,13 +13,15 @@ import logging import string from types import MappingProxyType -from typing import Any - -from markdown_it.token import Token +from typing import TYPE_CHECKING, Any from mdformat.renderer._context import DEFAULT_RENDERERS, WRAP_POINT, RenderContext from mdformat.renderer._tree import RenderTreeNode -from mdformat.renderer.typing import Postprocess + +if TYPE_CHECKING: + from markdown_it.token import Token + + from mdformat.renderer.typing import Postprocess LOGGER = logging.getLogger(__name__) diff --git a/src/mdformat/renderer/_context.py b/src/mdformat/renderer/_context.py index 1e2af49d..1e5a6665 100644 --- a/src/mdformat/renderer/_context.py +++ b/src/mdformat/renderer/_context.py @@ -15,7 +15,6 @@ from mdformat import codepoints from mdformat._conf import DEFAULT_OPTS from mdformat.renderer._util import ( - RE_CHAR_REFERENCE, decimalify_leading, decimalify_trailing, escape_asterisk_emphasis, @@ -27,6 +26,7 @@ is_tight_list_item, longest_consecutive_sequence, maybe_add_link_brackets, + re_char_reference, ) if TYPE_CHECKING: @@ -137,7 +137,7 @@ def text(node: RenderTreeNode, context: RenderContext) -> str: # Escape "&" if it 
starts a sequence that can be interpreted as # a character reference. - text = RE_CHAR_REFERENCE.sub(r"\\\g<0>", text) + text = re_char_reference().sub(r"\\\g<0>", text) # The parser can give us consecutive newlines which can break # the markdown structure. Replace two or more consecutive newlines diff --git a/src/mdformat/renderer/_util.py b/src/mdformat/renderer/_util.py index 45a17a4d..6532cc59 100644 --- a/src/mdformat/renderer/_util.py +++ b/src/mdformat/renderer/_util.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections.abc import Iterable +import functools import html.entities import re from typing import TYPE_CHECKING @@ -10,20 +11,28 @@ if TYPE_CHECKING: from mdformat.renderer import RenderTreeNode -# Regex that finds character references. -# The reference can be either -# 1. decimal representation, e.g. &#11; -# 2. hex representation, e.g. &#x1e; -# 3. HTML5 entity reference, e.g. &nbsp; -RE_CHAR_REFERENCE = re.compile( - "&(?:" - + "#[0-9]{1,7}" - + "|" - + "#[Xx][0-9A-Fa-f]{1,6}" - + "|" - + "|".join({c.rstrip(";") for c in html.entities.html5}) - + ");" -) + +@functools.cache +def re_char_reference() -> re.Pattern[str]: + """Return a regex that finds character references. + + The reference can be either: + 1. decimal representation, e.g. &#11; + 2. hex representation, e.g. &#x1e; + 3. HTML5 entity reference, e.g. &nbsp; + + This cached function compiles the regex lazily, + as compilation can take over 20ms. 
+ """ + return re.compile( + "&(?:" + + "#[0-9]{1,7}" + + "|" + + "#[Xx][0-9A-Fa-f]{1,6}" + + "|" + + "|".join({c.rstrip(";") for c in html.entities.html5}) + + ");" + ) def is_tight_list(node: RenderTreeNode) -> bool: diff --git a/tests/test_api.py b/tests/test_api.py index ce900494..a7c58e03 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -146,3 +146,12 @@ def test_mdrenderer_no_finalize(tmp_path): def test_ascii_whitespace_deprecation(): with pytest.warns(DeprecationWarning): mdformat.codepoints.ASCII_WHITESPACE + + +def test_import_typing(): + """Try to import mdformat.renderer.typing. + + The module consists of annotation types only, so mdformat never + imports it at runtime. This test ensures that it still runs. + """ + import mdformat.renderer.typing # noqa: F401 diff --git a/tests/utils.py b/tests/utils.py index 3ab54a7b..35612274 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,11 +1,15 @@ -import json +from __future__ import annotations -from markdown_it import MarkdownIt +import json +from typing import TYPE_CHECKING from mdformat._cli import run from mdformat._conf import read_toml_opts from mdformat.renderer import RenderContext, RenderTreeNode +if TYPE_CHECKING: + from markdown_it import MarkdownIt + UNFORMATTED_MARKDOWN = "\n\n# A header\n\n" FORMATTED_MARKDOWN = "# A header\n"