diff --git a/pyproject.toml b/pyproject.toml
index 2ea3b9c..6e7a9d1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,8 +23,10 @@ classifiers = [
     "Topic :: Text Processing :: Markup :: HTML",
 ]
 dependencies = [
+    "numpy",
     "python-magic",  # XXX review
     "tokenizers",
+    "transformers",
     "unidecode",  # XXX review
 ]
 
@@ -42,12 +44,10 @@ dev = [
     "pillow",
     "pytest",
     "pytest-cov",
-    "transformers",
 ]
 train = [
     "datasets",
     "pillow",
-    "transformers",
 ]
 
 [project.scripts]
diff --git a/runner.py b/runner.py
new file mode 100644
index 0000000..24297e6
--- /dev/null
+++ b/runner.py
@@ -0,0 +1,53 @@
+import sys
+import warnings
+
+from itertools import chain
+
+from dom_tokenizers.internal import json
+from dom_tokenizers.pre_tokenizers.shared_oracle import SharedOracle
+
+DEFAULT_TESTCASES = [
+    "overflow",
+    "uniqueid",
+    "uniqueId",
+    "uniqueID",
+    "pagewrap",
+    "pageWrap",
+    "autocompletetype",
+    "autocompleteType",
+    "backfill",
+    "Inauspicious",
+    "Llanfairpwllgwyngyllgogerychwyrndrobwllllantysiliogogogoch",
+
+    "1655885421832",  # first token is 4 chars
+    "8eb5e30dac7d493298287704a5f578c7",
+    "next/static/css/99762953f4d03581",
+    "org/TR/xhtml1/DTD/xhtml1",
+    "KFOmCnqEu92Fr1Mu4mxK",
+    "electronically8eb5e30dac7",  # median chars/token = 1.0 (mean=2.7)
+    "electronically8eb5e30dac",  # median chars/token = 1.5 (mean=3.0)
+    "electronically8eb5e30da",  # median chars/token = 2.0 (mean=3.3)
+]
+
+
+def main():
+    warnings.filterwarnings("ignore", message=r".*resume_download.*")
+
+    oracle = SharedOracle()
+    if len(sys.argv) < 2:
+        lines = DEFAULT_TESTCASES
+    else:
+        lines = chain.from_iterable(
+            (json.loads(line)["text"]
+             for line in open(filename).readlines())
+            for filename in sys.argv[1:])
+
+    for line in lines:
+        print("input:", line)
+        result = oracle.split_if_trivial(line, log_unhandled=False)
+        if result is not None:
+            print(f"\x1B[32m{result}\x1B[0m\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/dom_tokenizers/pre_tokenizers/oracle.py b/src/dom_tokenizers/pre_tokenizers/oracle.py
new file mode 100644
index 0000000..3b14644
--- /dev/null
+++ b/src/dom_tokenizers/pre_tokenizers/oracle.py
@@ -0,0 +1,141 @@
+import re
+
+from typing import Optional, Callable
+
+import numpy as np
+
+from ..internal import jsonl
+from ..internal.transformers import AutoTokenizer
+
+_IntOrIntList = int | list[int]
+_StrOrStrList = str | list[str]
+
+
+class Oracle:
+    def __init__(self, *args, **kwargs):
+        self._tok = AutoTokenizer.from_pretrained(*args, **kwargs)
+        self._tok.model_max_length = 1 << 31
+        self.cls_token_id = self._tok.cls_token_id
+        self.sep_token_id = self._tok.sep_token_id
+        self.unk_token_id = self._tok.unk_token_id
+        self.max_token_len = max(
+            len(token) for token in self._tok.vocab
+        )
+        self.max_try_split_len = min(self.max_token_len * 5, 100)
+        self._log = jsonl.Writer(basename="oracle", with_timestamp=True)
+
+    def close(self):
+        self._log.close()
+
+    @property
+    def normalize_str(self) -> Callable[[str], str]:
+        """Normalize the given string.
+        """
+        return self._tok.backend_tokenizer.normalizer.normalize_str
+
+    def encode(self, *args, **kwargs) -> list[int]:
+        """Convert the given string to a list of integer token IDs.
+ """ + token_ids = self._tok.encode(*args, **kwargs) + assert token_ids[0] == self.cls_token_id + assert token_ids[-1] == self.sep_token_id + return token_ids[1:-1] + + IDsToTokensType = Callable[[_IntOrIntList], _StrOrStrList] + + @property + def convert_ids_to_tokens(self, *args, **kwargs) -> IDsToTokensType: + """Convert the given list of token IDs to a list of tokens. + """ + return self._tok.convert_ids_to_tokens + + def tokenize(self, *args, **kwargs) -> list[str]: + """Convert the given string into a list of tokens. + """ + return self.convert_ids_to_tokens(self.encode(*args, **kwargs)) + + @property + def decode(self) -> Callable[[_IntOrIntList], str]: + """Convert the given list of token IDs to a string. + """ + return self._tok.decode + + # For quick checks, see TextSplitter.BASE64_RE for the real deal + _LOOSE_BASE64_RE = re.compile(r"^[A-Za-z0-9+/]+={0,2}$") + + def split_if_trivial( + self, + text: str, + log_unhandled: bool = True, # XXX + ) -> Optional[list[str]]: + """Split a string into a list of tokens XXX IF! + + Like `tokenize()` but it only returns if XXX. Otherwise None is + returned. + """ + if len(text) > self.max_try_split_len: + return None + + # Fast path for text that's in the oracle's vocabulary. + if len(text) <= self.max_token_len and ( + (text in self._tok.vocab + or text.lower() in self._tok.vocab) + and text.isalnum()): + return [text] + + # Limit ourselves to base64-ish input, for now at least. + if not self._LOOSE_BASE64_RE.match(text): + raise NotImplementedError(text) + + token_ids = self.encode(text) + if not token_ids or self.unk_token_id in token_ids: + return None + + tokens = self.convert_ids_to_tokens(token_ids) + word_pieces = [token.lstrip("#") for token in tokens] + token_lengths = [len(token) for token in word_pieces] + + # If the tokens are mostly 2+ characters long and the + # input text splits on whitespace in the same places as + # the decoded token ID sequence then call this a match. + # Subtracting the standard deviation prevents situations + # where one long token skews the median away from a load + # of 1-2 character tokens, e.g. "electronically8eb5e30da" + # tokenizes to ["electronically", "8", "eb", "5", "e", + # "30", "da"] with bert-base-uncased, so a median token + # length of 2 characters/token and a mean of 3.3, but + # the standard deviation of 4.4 indicates at least one + # token is very far from the mean. + median_length = np.median(token_lengths) + length_stddev = np.std(token_lengths) + if median_length - length_stddev > 1: + result = text.split() + want = [token.lower() for token in result] + if self.decode(token_ids).split() == want: + return result + + print(f"tokens: {tokens}"[:80]) + + first_token_id = token_ids[0] + first_token = self.convert_ids_to_tokens(first_token_id) + assert "#" not in first_token + print(f"first_token: {first_token!r} ({first_token_id})") + + chars_per_token = len(text) / len(token_ids) + + #mean = sum(token_lengths) / len(token_ids) + print("chars_per_token:", chars_per_token) + #print("or ------> mean:", mean) + print(" median:", median_length) + print(" std.dev:", length_stddev) + print() + + # XXX now what? 
+        if log_unhandled:
+            self._log.write(
+                text=text, token_ids=token_ids,
+                tokens=tokens, decoded=self.decode(token_ids),
+                chars_per_token=chars_per_token,
+            )
+        return None
diff --git a/src/dom_tokenizers/pre_tokenizers/shared_oracle.py b/src/dom_tokenizers/pre_tokenizers/shared_oracle.py
new file mode 100644
index 0000000..9b54ecf
--- /dev/null
+++ b/src/dom_tokenizers/pre_tokenizers/shared_oracle.py
@@ -0,0 +1,18 @@
+import atexit
+
+from .oracle import Oracle
+
+
+class SharedOracle(Oracle):
+    _shared_borg_state = {}
+
+    def __new__(cls, *args, **kwargs):
+        obj = super().__new__(cls)
+        obj.__dict__ = cls._shared_borg_state
+        return obj
+
+    def __init__(self, model="bert-base-uncased", *args, **kwargs):
+        if hasattr(self, "_tok"):
+            return
+        super().__init__(model, *args, **kwargs)
+        atexit.register(self.close)
diff --git a/src/dom_tokenizers/pre_tokenizers/splitter.py b/src/dom_tokenizers/pre_tokenizers/splitter.py
index 7729003..b90cc83 100644
--- a/src/dom_tokenizers/pre_tokenizers/splitter.py
+++ b/src/dom_tokenizers/pre_tokenizers/splitter.py
@@ -4,7 +4,7 @@
 from base64 import binascii, b64decode
 from collections import defaultdict
 from collections.abc import Iterable
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from urllib.parse import unquote
 
 import magic
@@ -12,6 +12,8 @@
 from unidecode import unidecode
 
 from ..internal import json
+from .oracle import Oracle
+from .shared_oracle import SharedOracle
 from .sniffer import sniff_bytes
 
 logger = logging.getLogger(__name__)
@@ -44,6 +46,7 @@ class FalseBase64Error(RuntimeError):
 class TextSplitter:
     base64_token: str = "[BASE64]"
     long_token: str = "[LONG]"
+    oracle: Oracle = field(default_factory=SharedOracle)
 
     @property
     def special_tokens(self) -> Iterable[str]:
@@ -424,20 +427,20 @@ def _sub_urlencoded(self, splits, cursor):
 
     def _split_base64(self, encoded):
         try:
-            encoded = encoded.encode("ascii")
+            _encoded = encoded.encode("ascii")
         except UnicodeEncodeError:
             return None
         try:
-            data = b64decode(encoded, validate=True)
+            data = b64decode(_encoded, validate=True)
         except binascii.Error:
             return None
         try:
             text = data.decode("utf-8")
         except UnicodeDecodeError:
-            return self._split_base64_binary(data)
-        return self._split_base64_utf8(text)
+            return self._split_base64_binary(data, encoded)
+        return self._split_base64_utf8(text, encoded)
 
-    def _split_base64_utf8(self, text):
+    def _split_base64_utf8(self, text, encoded):
         match = self.XML_HDR_RE.match(text)
         if match is not None:
             if match.group(1) == "svg":
@@ -448,12 +451,16 @@ def _split_base64_utf8(self, text):
                 return [self.base64_token, "json"]
             except json.JSONDecodeError:
                 pass
+        if self.oracle.first_is_better(encoded, text):
+            return None  # encoded is better
         return [self.base64_token, "text"]
 
-    def _split_base64_binary(self, data):
+    def _split_base64_binary(self, data, encoded):
         filetype = sniff_bytes(data)
         if not filetype:
-            return None
+            if self.oracle.is_texty(encoded):
+                return None
+            return [self.base64_token, "data"]
         return [self.base64_token, filetype.name.lower()]
 
     # XXX junk?
diff --git a/tests/test_oracle.py b/tests/test_oracle.py
new file mode 100644
index 0000000..78e6265
--- /dev/null
+++ b/tests/test_oracle.py
@@ -0,0 +1,110 @@
+import pytest
+
+from dom_tokenizers.pre_tokenizers.shared_oracle import SharedOracle
+
+
+@pytest.mark.parametrize(
+    ("text,expect_normalized"),
+    (("hello world", "hello world"),
+     ("html", "html"),
+     ("", ""),
+     ("HTML", "html"),
+     ("Parse error", "parse error"),
+     (" html", " html"),
+     ("html ", "html "),
+     (": syntax error, unexpected ')' in ",
+      ": syntax error, unexpected ')' in "),
+     ("\n", " "),
+     (": \t syntax error, unexpected ')' in ",
+      ":   syntax error, unexpected ')' in "),
+     ("\ufeff", ""),
+     ))
+def test_normalizer(text, expect_normalized):
+    """Check the backend normalizer works as we expect.
+
+    Specifically:
+    - lowercasing is performed
+    - leading and trailing whitespace are retained
+    - sequences of whitespace are not compressed
+    - all whitespace characters become ASCII space
+    - punctuation is retained
+    - BOM is not retained
+    """
+    assert SharedOracle().normalize_str(text) == expect_normalized
+
+
+@pytest.mark.parametrize(
+    ("text,expect_tokens"),
+    (("hello world", ["hello", "world"]),
+     ("html", ["html"]),
+     ("", []),
+     ("HTML", ["html"]),
+     ("Parse error", ["par", "##se", "error"]),
+     ("宏 error", ["[UNK]", "error"]),
+     (" html", ["html"]),
+     ("html ", ["html"]),
+     (": syntax error, unexpected ')' in ",
+      [":", "syntax", "error", ",", "unexpected", "'", ")", "'", "in"]),
+     (": \t syntax error, unexpected ')' in ",
+      [":", "syntax", "error", ",", "unexpected", "'", ")", "'", "in"]),
+     ("\ufeff", []),
+
+     # Testcases for split_if_trivial()
+     ("overflow", ["over", "##flow"]),
+     ("uniqueid", ["unique", "##id"]),
+     ("pagewrap", ["page", "##wr", "##ap"]),
+     ("autocompletetype", ["auto", "##com", "##ple", "##tet", "##ype"]),
+     ("Inauspicious", ["ina", "##us", "##pic", "##ious"]),
+     ("Llanfairpwllgwyngyllgogerychwyrndrobwllllantysiliogogogoch",
+      ["ll", "##an", "##fa", "##ir", "##pw", "##ll", "##g", "##wyn",
+       "##gy", "##ll", "##go", "##ger", "##ych", "##wy", "##rn",
+       "##dro", "##b", "##wl", "##ll", "##lan", "##ty", "##sil",
+       "##io", "##go", "##go", "##go", "##ch"]),
+     ("1655885421832",
+      ["1655", "##8", "##85", "##42", "##18", "##32"]),
+     ))
+def test_tokenizer(text, expect_tokens):
+    """Check the backend tokenizer works as expected.
+
+    Specifically:
+    - normalization is performed as per `test_normalizer()`
+    - whitespace causes splits but is not retained
+    - unhandled input is substituted with [UNK]
+    - result is not bracketed by [CLS], [SEP]
+    """
+    assert SharedOracle().tokenize(text) == expect_tokens
+
+
+@pytest.mark.parametrize(
+    ("text,expect_tokens"),
+    (("overflow", ["overflow"]),
+     ("uniqueid", ["uniqueid"]),
+     ("uniqueId", ["uniqueId"]),
+     ("uniqueID", ["uniqueID"]),
+     ("pagewrap", ["pagewrap"]),
+     ("autocompletetype", ["autocompletetype"]),
+     ("Inauspicious", ["Inauspicious"]),
+     ("Llanfairpwllgwyngyllgogerychwyrndrobwllllantysiliogogogoch",
+      ["Llanfairpwllgwyngyllgogerychwyrndrobwllllantysiliogogogoch"]),
+     #("8eb5e30dac7d493298287704a5f578c7",
+     # ["[CWCI]"]),
+     #("next/static/css/99762953f4d03581",
+     # ["next", "static", "css", "[CWCI]"]),
+     #("org/TR/xhtml1/DTD/xhtml1",
+     # ["org", "TR", "xhtml1", "DTD", "xhtml1"]),
+     #("KFOmCnqEu92Fr1Mu4mxK", ["[CWCI]"]),
+     #("pageWrap", ["page", "Wrap"]),  # XXX maybe?
+ #("autocompleteType", ["autocomplete", "Type"]), + #("electronically8eb5e30dac7", # median chars/token = 1.0 (mean=2.7) + # ["electronically", "[CWCI]"]), + #("electronically8eb5e30dac", # median chars/token = 1.5 (mean=3.0) + # ["electronically", "[CWCI]"]), + #("electronically8eb5e30da", # median chars/token = 2.0 (mean=3.3) + # ["electronically", "[CWCI]"]), + # ("1655885421832", ["[CWCI]"]), + )) +def test_split_if_trivial(text, expect_tokens): + """Check `Oracle.split_if_trivial()` is doing what it should. + """ + assert SharedOracle().split_if_trivial( + text, log_unhandled=False) == expect_tokens diff --git a/tests/test_splitter.py b/tests/test_splitter.py index 11a96f4..f97e6f4 100644 --- a/tests/test_splitter.py +++ b/tests/test_splitter.py @@ -213,6 +213,8 @@ def test_decoding(text, expect_tokens): ["src", "url", "fonts", "gstatic", "com", "s", "roboto", "v18", "KFOmCnqEu92Fr1Mu4mxK", "woff2", "format", "woff2", "unicode", "range", "U", "0000", "00FF"]), + ("0x8eb5e30dac7d493298287704a5f578c7", + ["0x", "[LONG]", "hex", "digits"]), )) def test_regressions(text, expect_tokens): """Check that things we improve stay improved.