Skip to content

Commit

Permalink
Handle more apostrophe surrogates
Browse files Browse the repository at this point in the history
  • Loading branch information
gbenson committed Jun 9, 2024
1 parent 378e50a commit 49994e2
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 4 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dom-tokenizers"
version = "0.0.15"
version = "0.0.16"
authors = [{ name = "Gary Benson", email = "[email protected]" }]
description = "DOM-aware tokenization for 🤗 Hugging Face language models"
readme = "README.md"
Expand Down
26 changes: 23 additions & 3 deletions src/dom_tokenizers/pre_tokenizers/splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,15 +69,35 @@ class TextSplitter:
def special_tokens(self) -> Iterable[str]:
    """Yield the value of every attribute whose name ends in "_token"."""
    for attr, value in self.__dict__.items():
        if attr.endswith("_token"):
            yield value

# Code points that render like (or are commonly used in place of) an
# ASCII apostrophe, collected so e.g. "don\u2019t" tokenizes the same
# way as "don't".
_APOSTROPHISH = {
    0x02b9, # Modifier letter prime
    0x02bc, # Modifier letter apostrophe
    0x02bf, # Modifier letter left half ring
    0x02c8, # Modifier letter vertical line
    0x055a, # Armenian apostrophe
    0x05f3, # Hebrew punctuation geresh
    0x1fbd, # Greek koronis
    0x1fbf, # Greek psili
    0x1ffd, # Greek oxia
    0x2018, # Left single quotation mark
    0x2019, # Right single quotation mark
    0x201b, # Single high-reversed-9 quotation mark
    0x2032, # Prime
    0x275c, # Heavy single comma quotation mark ornament
    0xff07, # Fullwidth apostrophe
}
# The same code points as a string, for embedding in regex character
# classes (none of them are special inside a character class, so no
# escaping is needed).
APOSTROPHISH = "".join(map(chr, sorted(_APOSTROPHISH)))
# Fix: interpolate the joined string APOSTROPHISH, not the set
# _APOSTROPHISH — formatting the set put its repr (decimal digits,
# commas, braces) into the character class, so the regex matched
# digits and punctuation instead of apostrophe lookalikes.
APOSTROPHISH_RE = re.compile(rf"[{APOSTROPHISH}]")

# Partially split into words, but retain the non-word characters
# until everything's de-escaped and base64 is identified.
# - `+/=` are allowed within words here to keep base64-encoded
#   data in one "word".
# - Apostrophes are... included for now XXX
# - Underscores are included in "\w", so we have to handle them.
BASE64_NONWORD = "+/="
# (A duplicate, immediately-shadowed pair of assignments — leftover
# old versions of the two lines below — has been removed.)
FIRST_SPLIT_RE = re.compile(rf"([^\w'{APOSTROPHISH}{BASE64_NONWORD}]+)")
BASE64_NONWORD_RE = re.compile(rf"[{BASE64_NONWORD}]+")

_TWOHEX = "[0-9a-fA-F]{2}"
TWOHEX_RE = re.compile(_TWOHEX)
Expand All @@ -88,7 +108,7 @@ def special_tokens(self) -> Iterable[str]:

# XXX older bits
MAXWORDLEN = 32
# A "word": runs of \w characters, optionally joined by single
# apostrophe-like characters, so "don't" and "don\u2019t" each match
# as one word.  (A duplicate, immediately-shadowed old version of this
# assignment — without the APOSTROPHISH class — has been removed.)
WORD_RE = re.compile(rf"(?:\w+['{APOSTROPHISH}]?)+")
# Hex-ish strings: "0x" followed by 6+ hex digits, or 8+ bare hex
# digits; case-insensitive.
HEX_RE = re.compile(r"^(?:0x|[0-9a-f]{2})[0-9a-f]{6,}$", re.I)
DIGIT_RE = re.compile(r"\d")
LONGEST_URLISH = 1024  # XXX?
Expand Down

0 comments on commit 49994e2

Please sign in to comment.