diff --git a/pyproject.toml b/pyproject.toml
index abc5800..c8c600c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dom-tokenizers"
-version = "0.0.15"
+version = "0.0.16"
 authors = [{ name = "Gary Benson", email = "gary@gbenson.net" }]
 description = "DOM-aware tokenization for 🤗 Hugging Face language models"
 readme = "README.md"
diff --git a/src/dom_tokenizers/pre_tokenizers/splitter.py b/src/dom_tokenizers/pre_tokenizers/splitter.py
index 1af33ff..3cc409a 100644
--- a/src/dom_tokenizers/pre_tokenizers/splitter.py
+++ b/src/dom_tokenizers/pre_tokenizers/splitter.py
@@ -69,6 +69,26 @@ class TextSplitter:
     def special_tokens(self) -> Iterable[str]:
         return (v for k, v in self.__dict__.items() if k.endswith("_token"))
 
+    _APOSTROPHISH = {
+        0x02b9,  # Modifier letter prime
+        0x02bc,  # Modifier letter apostrophe
+        0x02bf,  # Modifier letter left half ring
+        0x02c8,  # Modifier letter vertical line
+        0x055a,  # Armenian apostrophe
+        0x05f3,  # Hebrew punctuation geresh
+        0x1fbd,  # Greek koronis
+        0x1fbf,  # Greek psili
+        0x1ffd,  # Greek oxia
+        0x2018,  # Left single quotation mark
+        0x2019,  # Right single quotation mark
+        0x201b,  # Single high-reversed-9 quotation mark
+        0x2032,  # Prime
+        0x275c,  # Heavy single comma quotation mark ornament
+        0xff07,  # Fullwidth apostrophe
+    }
+    APOSTROPHISH = "".join(map(chr, sorted(_APOSTROPHISH)))
+    APOSTROPHISH_RE = re.compile(rf"[{APOSTROPHISH}]")
+
     # Partially split into words, but retain the non-word characters
     # until everything's de-escaped and base64 is identified.
     # - `+/=` are allowed within words here to keep base64-encoded
@@ -76,8 +96,8 @@ def special_tokens(self) -> Iterable[str]:
     # - Apostrophes are... included for now XXX
     # - Underscores are included in "\w", so we have to handle them.
     BASE64_NONWORD = "+/="
-    FIRST_SPLIT_RE = re.compile(rf"([^\w'’{BASE64_NONWORD}]+)")
-    BASE64_NONWORD_RE = re.compile("[+/=]+")
+    FIRST_SPLIT_RE = re.compile(rf"([^\w'{APOSTROPHISH}{BASE64_NONWORD}]+)")
+    BASE64_NONWORD_RE = re.compile(rf"[{BASE64_NONWORD}]+")
 
     _TWOHEX = "[0-9a-fA-F]{2}"
     TWOHEX_RE = re.compile(_TWOHEX)
@@ -88,7 +108,7 @@ def special_tokens(self) -> Iterable[str]:
 
     # XXX older bits
     MAXWORDLEN = 32
-    WORD_RE = re.compile(r"(?:\w+['’]?)+")
+    WORD_RE = re.compile(rf"(?:\w+['{APOSTROPHISH}]?)+")
     HEX_RE = re.compile(r"^(?:0x|[0-9a-f]{2})[0-9a-f]{6,}$", re.I)
     DIGIT_RE = re.compile(r"\d")
     LONGEST_URLISH = 1024  # XXX?
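
For review purposes, a minimal self-contained sketch of what the widened character class does. The constants are copied from the diff above; the sample strings, and the `sub()` normalization at the end, are illustrative assumptions rather than code from the repo:

```python
import re

# Copied from the splitter.py hunk above.
_APOSTROPHISH = {
    0x02b9, 0x02bc, 0x02bf, 0x02c8, 0x055a, 0x05f3, 0x1fbd, 0x1fbf,
    0x1ffd, 0x2018, 0x2019, 0x201b, 0x2032, 0x275c, 0xff07,
}
APOSTROPHISH = "".join(map(chr, sorted(_APOSTROPHISH)))
APOSTROPHISH_RE = re.compile(rf"[{APOSTROPHISH}]")

BASE64_NONWORD = "+/="
FIRST_SPLIT_RE = re.compile(rf"([^\w'{APOSTROPHISH}{BASE64_NONWORD}]+)")

# Words containing any apostrophe-like codepoint now survive the first
# split intact, as ASCII "'" and U+2019 already did:
assert FIRST_SPLIT_RE.split("doesn\u02bct stop") == ["doesn\u02bct", " ", "stop"]

# `+/=` still hold base64-ish runs together as a single "word":
assert FIRST_SPLIT_RE.split("sig=aGVsbG8= ok") == ["sig=aGVsbG8=", " ", "ok"]

# Hypothetical use of APOSTROPHISH_RE: collapse every variant to "'".
assert APOSTROPHISH_RE.sub("'", "don\u2019t \uff07ok\uff07") == "don't 'ok'"
```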