From 28ffd6eb61489c91c9469fe7646afb69d75c394e Mon Sep 17 00:00:00 2001
From: Gary Benson <gary@gbenson.net>
Date: Thu, 30 May 2024 19:43:53 +0100
Subject: [PATCH] Enforce compact JSON serialization

---
 .flake8                                        |  1 +
 pyproject.toml                                 |  3 ++-
 src/dom_tokenizers/diff.py                     |  4 ++--
 src/dom_tokenizers/dump.py                     |  6 +++---
 src/dom_tokenizers/internal/json.py            | 18 ++++++++++++++++++
 .../pre_tokenizers/dom_snapshot.py             |  3 +--
 src/dom_tokenizers/pre_tokenizers/splitter.py  |  2 +-
 src/dom_tokenizers/profile.py                  |  4 ++--
 src/dom_tokenizers/train.py                    |  2 +-
 .../dom_snapshot/test_pre_tokenizer.py         |  6 ++----
 tests/test_tokenization.py                     |  4 +---
 tests/util.py                                  |  2 ++
 12 files changed, 36 insertions(+), 19 deletions(-)
 create mode 100644 src/dom_tokenizers/internal/json.py

diff --git a/.flake8 b/.flake8
index fba744a..f9e4225 100644
--- a/.flake8
+++ b/.flake8
@@ -1,5 +1,6 @@
 [flake8]
 exclude = .git,__pycache__,venv*,.venv*,build,dist,.local,.#*,#*,*~
+restricted_packages = json
 inline-quotes = "
 per-file-ignores =
     # imported but unused
diff --git a/pyproject.toml b/pyproject.toml
index 7079651..2ea3b9c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dom-tokenizers"
-version = "0.0.13"
+version = "0.0.14"
 authors = [{ name = "Gary Benson", email = "gary@gbenson.net" }]
 description = "DOM-aware tokenization for 🤗 Hugging Face language models"
 readme = "README.md"
@@ -37,6 +37,7 @@ dev = [
     "build",
     "datasets",
     "flake8",
+    "flake8-custom-import-rules",
     "flake8-quotes",
     "pillow",
     "pytest",
diff --git a/src/dom_tokenizers/diff.py b/src/dom_tokenizers/diff.py
index 4d64d22..8e42ae9 100644
--- a/src/dom_tokenizers/diff.py
+++ b/src/dom_tokenizers/diff.py
@@ -1,9 +1,9 @@
-import json
 import warnings
 
 from argparse import ArgumentParser
 from difflib import SequenceMatcher
 
+from .internal import json
 from .internal.transformers import AutoTokenizer
 from .pre_tokenizers import DOMSnapshotPreTokenizer
 
@@ -105,7 +105,7 @@ def main():
     for line in open(args.reference).readlines():
         row = json.loads(line)
         source_index = row["source_index"]
-        serialized = json.dumps(row["dom_snapshot"], separators=(",", ":"))
+        serialized = json.dumps(row["dom_snapshot"])
         b = tokenizer.tokenize(serialized)
         a = row["tokenized"]
         if b == a:
diff --git a/src/dom_tokenizers/dump.py b/src/dom_tokenizers/dump.py
index f8bd591..452d105 100644
--- a/src/dom_tokenizers/dump.py
+++ b/src/dom_tokenizers/dump.py
@@ -1,10 +1,10 @@
-import json
 import warnings
 
 from argparse import ArgumentParser
 
 from datasets import load_dataset
 
+from .internal import json
 from .internal.transformers import AutoTokenizer
 from .pre_tokenizers import DOMSnapshotPreTokenizer
 
@@ -35,11 +35,11 @@ def main():
 
     dataset = load_dataset(args.dataset, split=args.split)
     rows = ((row["source_index"], row["dom_snapshot"]) for row in dataset)
-    rows = ((si, ss, json.dumps(ss, separators=(",", ":"))) for si, ss in rows)
+    rows = ((si, ss, json.dumps(ss)) for si, ss in rows)
     rows = ((len(ser), si, ss, ser) for si, ss, ser in rows)
     for _, source_index, dom_snapshot, serialized in sorted(rows):
         print(json.dumps(dict(
             source_index=source_index,
             dom_snapshot=dom_snapshot,
             tokenized=tokenizer.tokenize(serialized)
-        ), separators=(",", ":")))
+        )))
diff --git a/src/dom_tokenizers/internal/json.py b/src/dom_tokenizers/internal/json.py
new file mode 100644
index 0000000..5ff82bc
--- /dev/null
+++ b/src/dom_tokenizers/internal/json.py
@@ -0,0 +1,18 @@
+from json import *  # noqa: F403, CIR107
+
+
+# Default to compact serialization.
+
+def __wrap(func):
+    def wrapper(*args, **kwargs):
+        new_kwargs = {"separators": (",", ":")}
+        new_kwargs.update(kwargs)
+        return func(*args, **new_kwargs)
+    wrapper.__name__ = func.__name__
+    wrapper.__doc__ = func.__doc__
+    return wrapper
+
+
+dump = __wrap(dump)  # noqa: F405
+dumps = __wrap(dumps)  # noqa: F405
+del __wrap
diff --git a/src/dom_tokenizers/pre_tokenizers/dom_snapshot.py b/src/dom_tokenizers/pre_tokenizers/dom_snapshot.py
index d99f2d4..28ceffc 100644
--- a/src/dom_tokenizers/pre_tokenizers/dom_snapshot.py
+++ b/src/dom_tokenizers/pre_tokenizers/dom_snapshot.py
@@ -1,10 +1,9 @@
-import json
-
 from dataclasses import make_dataclass
 from xml.dom import Node
 
 from tokenizers import NormalizedString
 
+from ..internal import json
 from .compat_itertools import batched
 from .html import is_void_element
 from .pre_tokenizer import PreTokenizer
diff --git a/src/dom_tokenizers/pre_tokenizers/splitter.py b/src/dom_tokenizers/pre_tokenizers/splitter.py
index cb49008..2d5d8cb 100644
--- a/src/dom_tokenizers/pre_tokenizers/splitter.py
+++ b/src/dom_tokenizers/pre_tokenizers/splitter.py
@@ -1,4 +1,3 @@
-import json
 import re
 
 from base64 import b64decode
@@ -12,6 +11,7 @@
 
 from unidecode import unidecode
 
+from ..internal import json
 
 _B64_RE_S = r"(?:[A-Za-z0-9+/]{4}){"
 _B64_RE_E = r",}(?:[A-Za-z0-9+/]{3}=|[A-Za-z0-9+/]{2}==)?"
diff --git a/src/dom_tokenizers/profile.py b/src/dom_tokenizers/profile.py
index 7b84d1c..21fc327 100644
--- a/src/dom_tokenizers/profile.py
+++ b/src/dom_tokenizers/profile.py
@@ -1,5 +1,4 @@
 import cProfile as profile
-import json
 import os
 import time
 import warnings
@@ -10,6 +9,7 @@
 from datasets import load_dataset
 from tokenizers import NormalizedString
 
+from .internal import json
 from .internal.transformers import AutoTokenizer
 from .pre_tokenizers import DOMSnapshotPreTokenizer
 
@@ -83,7 +83,7 @@ def main():
     os.makedirs(os.path.dirname(cache_filename), exist_ok=True)
     with open(cache_filename, "w") as fp:
         for row in training_dataset:
-            json.dump(row["dom_snapshot"], fp, separators=(",", ":"))
+            json.dump(row["dom_snapshot"], fp)
             fp.write("\n")
     del training_dataset
 
diff --git a/src/dom_tokenizers/train.py b/src/dom_tokenizers/train.py
index 849e4ef..055ade0 100644
--- a/src/dom_tokenizers/train.py
+++ b/src/dom_tokenizers/train.py
@@ -1,4 +1,3 @@
-import json
 import os
 import warnings
 
@@ -9,6 +8,7 @@
 from tokenizers import AddedToken
 from tokenizers.pre_tokenizers import WhitespaceSplit
 
+from .internal import json
 from .internal.transformers import AutoTokenizer
 from .pre_tokenizers import DOMSnapshotPreTokenizer
 
diff --git a/tests/pre_tokenizers/dom_snapshot/test_pre_tokenizer.py b/tests/pre_tokenizers/dom_snapshot/test_pre_tokenizer.py
index 6f49a19..386260d 100644
--- a/tests/pre_tokenizers/dom_snapshot/test_pre_tokenizer.py
+++ b/tests/pre_tokenizers/dom_snapshot/test_pre_tokenizer.py
@@ -1,6 +1,4 @@
-import json
-
-from ...util import load_resource
+from ...util import load_resource, json
 
 
 def test_raw_response_unwrapping(pre_tokenizer):
@@ -13,7 +11,7 @@ def test_raw_response_unwrapping(pre_tokenizer):
     assert set(browser_response.keys()) == {"id", "result", "sessionId"}
     regular_snapshot = browser_response["result"]
     assert set(regular_snapshot.keys()) == {"documents", "strings"}
-    regular_snapshot = json.dumps(regular_snapshot, separators=(",", ":"))
+    regular_snapshot = json.dumps(regular_snapshot)
     assert regular_snapshot in wrapped_snapshot
     del browser_response
 
diff --git a/tests/test_tokenization.py b/tests/test_tokenization.py
index ce3a6f6..bb43729 100644
--- a/tests/test_tokenization.py
+++ b/tests/test_tokenization.py
@@ -1,10 +1,8 @@
-import json
-
 from datasets import Dataset
 
 from dom_tokenizers.train import train_tokenizer, DEFAULT_VOCAB_SIZE
 
-from .util import load_resource
+from .util import load_resource, json
 
 
 def test_base64(dom_snapshot_tokenizer):
diff --git a/tests/util.py b/tests/util.py
index dd1291f..aa3218b 100644
--- a/tests/util.py
+++ b/tests/util.py
@@ -1,5 +1,7 @@
 import os
 
+from dom_tokenizers.internal import json  # noqa: F401
+
 
 def get_resource_filename(filename, *, ext=None):
     if ext and not filename.endswith(ext):
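
Note (not part of the patch itself): the new dom_tokenizers.internal.json module
re-exports the standard library json module with dump/dumps defaulting to compact
separators, and the restricted_packages = json setting (from
flake8-custom-import-rules) is what flags any remaining direct "import json" in
the package, hence the "# noqa: CIR107" on the shim's own star import.  A minimal
sketch of the resulting behaviour, assuming the package is installed so the
module is importable:

    import json as stdlib_json
    from dom_tokenizers.internal import json

    # Standard library default: spaces after the item and key separators.
    stdlib_json.dumps({"a": [1, 2]})   # '{"a": [1, 2]}'

    # The shim defaults dump/dumps to compact separators...
    json.dumps({"a": [1, 2]})          # '{"a":[1,2]}'

    # ...but explicit keyword arguments still win, and everything else
    # (load, loads, JSONDecodeError, ...) is re-exported unchanged.
    json.dumps({"a": [1, 2]}, separators=(", ", ": "))   # '{"a": [1, 2]}'
    json.loads('{"a":[1,2]}')          # {'a': [1, 2]}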