Skip to content

Commit

Permalink
Recognize prefixed hex
Browse files Browse the repository at this point in the history
  • Loading branch information
gbenson committed Jun 4, 2024
1 parent 2c24a89 commit 322a504
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 4 deletions.
12 changes: 11 additions & 1 deletion src/dom_tokenizers/pre_tokenizers/splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,12 @@ def special_tokens(self) -> Iterable[str]:
JS_CHAR_ESCAPE_RE = re.compile(f"(?:x|u{_TWOHEX}){_TWOHEX}")
ENTITY_STARTS = {"&", "&#"}
ESCAPE_START_RE = re.compile(r".([&%\\])")
PREFIXED_HEX_RE = re.compile(r"^(0x)([0-9a-f]+)([+/=]*)$", re.I)

# XXX older bits
MAXWORDLEN = 32
WORD_RE = re.compile(r"(?:\w+['’]?)+")
HEX_RE = re.compile(r"^(?:0x|[0-9a-f]{2})[0-9a-f]{6,}$")
HEX_RE = re.compile(r"^(?:0x|[0-9a-f]{2})[0-9a-f]{6,}$", re.I)
DIGIT_RE = re.compile(r"\d")
LONGEST_URLISH = 1024 # XXX?
URLISH_LOOKBACK = 5
Expand Down Expand Up @@ -159,6 +160,15 @@ def split(self, text: str) -> Iterable[str]:
splits[cursor:cursor+1] = new_splits
continue

# Are we looking at some prefixed hex?
if (match := self.PREFIXED_HEX_RE.match(curr)):
if VERBOSE: # pragma: no cover
debug("prefixed hex")
new_splits = [s for s in match.groups() if s]
splits[cursor:cursor+1] = new_splits
cursor += len(new_splits)
continue

# Are we looking at something that might be base64?
if self.BASE64_RE.match(curr):
cursor = self._sub_base64(splits, cursor)
Expand Down
27 changes: 24 additions & 3 deletions tests/test_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@


@pytest.mark.parametrize(
("text,expect_splits"),
"text,expect_splits",
(("hello world", ["hello", " ", "world"]),
("$hello world", ["", "$", "hello", " ", "world"]),
("hello-world", ["hello", "-", "world"]),
Expand Down Expand Up @@ -55,7 +55,7 @@ def test_first_split_re(text, expect_splits):


@pytest.mark.parametrize(
("text,expect_tokens"),
"text,expect_tokens",
(("hello world", ["hello", "world"]),
("hello-world", ["hello", "world"]),
("hello_world", ["hello", "world"]),
Expand Down Expand Up @@ -181,7 +181,28 @@ def test_decoding(text, expect_tokens):


@pytest.mark.parametrize(
    "text,expect_tokens", [
        # Small constants: prefix and digits become separate tokens.
        ("0x0", ["0x", "0"]),
        ("0x1234", ["0x", "1234"]),
        ("0x71c765", ["0x", "71c765"]),
        # Long digit runs collapse into the "[LONG] hex digits" form.
        ("0xdeadbeef",
         ["0x", "[LONG]", "hex", "digits"]),
        # A trailing non-hex character means it is not a hex constant.
        ("0xdeadbeefL", ["0xdeadbeefL"]),
        # Mixed-case tail: not pure hex, left intact.
        ("0x4AAAAAAAAjq6WYeRDKmebM",
         ["0x4AAAAAAAAjq6WYeRDKmebM"]),
        # Uppercase "0X" followed by non-hex: handled as base64 instead.
        ("0XPmYE28fJingEYE1hThk7F4SZFf1EVe2PxVNsmv",
         ["[BASE64]"]),
        # Uppercase prefix with uppercase hex digits is still recognized.
        ("0XBEA020C3BD417F30DE4D6BD05B0ED310AC586CC0",
         ["0X", "[LONG]", "hex", "digits"]),
    ])
def test_prefixed_hex(text, expect_tokens):
    """Ensure prefixed hex constants are recognized and split.
    """
    tokens = TextSplitter().split(text)
    assert list(tokens) == expect_tokens


@pytest.mark.parametrize(
"text,expect_tokens",
(("That\u2019s all we know.",
["That's", "all", "we", "know"]),
("Page=Login&Action=Login\';\n\t\t\treturn",
Expand Down

0 comments on commit 322a504

Please sign in to comment.