Skip to content

Commit 5d83ddb

Browse files
committed
Recognize prefixed hex
1 parent 2c24a89 commit 5d83ddb

File tree

2 files changed

+36
-4
lines changed

2 files changed

+36
-4
lines changed

src/dom_tokenizers/pre_tokenizers/splitter.py

+12-1
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,12 @@ def special_tokens(self) -> Iterable[str]:
6363
JS_CHAR_ESCAPE_RE = re.compile(f"(?:x|u{_TWOHEX}){_TWOHEX}")
6464
ENTITY_STARTS = {"&", "&#"}
6565
ESCAPE_START_RE = re.compile(r".([&%\\])")
66+
PREFIXED_HEX_RE = re.compile(r"^(0x)([0-9a-f]+)([+/=]*)$", re.I)
6667

6768
# XXX older bits
6869
MAXWORDLEN = 32
6970
WORD_RE = re.compile(r"(?:\w+['’]?)+")
70-
HEX_RE = re.compile(r"^(?:0x|[0-9a-f]{2})[0-9a-f]{6,}$")
71+
HEX_RE = re.compile(r"^(?:0x|[0-9a-f]{2})[0-9a-f]{6,}$", re.I)
7172
DIGIT_RE = re.compile(r"\d")
7273
LONGEST_URLISH = 1024 # XXX?
7374
URLISH_LOOKBACK = 5
@@ -159,6 +160,16 @@ def split(self, text: str) -> Iterable[str]:
159160
splits[cursor:cursor+1] = new_splits
160161
continue
161162

163+
# Are we looking at some prefixed hex?
164+
if (match := self.PREFIXED_HEX_RE.match(curr)):
165+
if VERBOSE: # pragma: no cover
166+
debug("prefixed hex")
167+
new_splits = [s for s in match.groups() if s]
168+
print("new_splits:", new_splits)
169+
splits[cursor:cursor+1] = new_splits
170+
cursor += len(new_splits)
171+
continue
172+
162173
# Are we looking at something that might be base64?
163174
if self.BASE64_RE.match(curr):
164175
cursor = self._sub_base64(splits, cursor)

tests/test_splitter.py

+24-3
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55

66
@pytest.mark.parametrize(
7-
("text,expect_splits"),
7+
"text,expect_splits",
88
(("hello world", ["hello", " ", "world"]),
99
("$hello world", ["", "$", "hello", " ", "world"]),
1010
("hello-world", ["hello", "-", "world"]),
@@ -55,7 +55,7 @@ def test_first_split_re(text, expect_splits):
5555

5656

5757
@pytest.mark.parametrize(
58-
("text,expect_tokens"),
58+
"text,expect_tokens",
5959
(("hello world", ["hello", "world"]),
6060
("hello-world", ["hello", "world"]),
6161
("hello_world", ["hello", "world"]),
@@ -181,7 +181,28 @@ def test_decoding(text, expect_tokens):
181181

182182

183183
@pytest.mark.parametrize(
184-
("text,expect_tokens"),
184+
"text,expect_tokens",
185+
(("0x0", ["0x", "0"]),
186+
("0x1234", ["0x", "1234"]),
187+
("0x71c765", ["0x", "71c765"]),
188+
("0xdeadbeef",
189+
["0x", "[LONG]", "hex", "digits"]),
190+
("0xdeadbeefL", ["0xdeadbeefL"]),
191+
("0x4AAAAAAAAjq6WYeRDKmebM",
192+
["0x4AAAAAAAAjq6WYeRDKmebM"]),
193+
("0XPmYE28fJingEYE1hThk7F4SZFf1EVe2PxVNsmv",
194+
["[BASE64]"]),
195+
("0XBEA020C3BD417F30DE4D6BD05B0ED310AC586CC0",
196+
["0X", "[LONG]", "hex", "digits"]),
197+
))
198+
def test_prefixed_hex(text, expect_tokens):
199+
"""Ensure prefixed hex constants are recognized and split.
200+
"""
201+
assert list(TextSplitter().split(text)) == expect_tokens
202+
203+
204+
@pytest.mark.parametrize(
205+
"text,expect_tokens",
185206
(("That\u2019s all we know.",
186207
["That's", "all", "we", "know"]),
187208
("Page=Login&Action=Login\';\n\t\t\treturn",

0 commit comments

Comments (0)